ref: 9ae6dcb7e063d495267e7a53ddecb4d8356645ba
dir: /src/ext4_journal.c/
/* * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com) * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/** @addtogroup lwext4
 * @{
 */
/**
 * @file  ext4_journal.c
 * @brief Journal handle functions
 */

#include "ext4_config.h"
#include "ext4_types.h"
#include "ext4_misc.h"
#include "ext4_errno.h"
#include "ext4_debug.h"

#include "ext4_fs.h"
#include "ext4_super.h"
#include "ext4_journal.h"
#include "ext4_blockdev.h"
#include "ext4_crc32.h"
#include "ext4_journal.h"

#include <string.h>
#include <stdlib.h>

/**@brief  Revoke entry collected during journal replay.
 *
 * Entries are keyed by @block in the revoke tree of struct recover_info.*/
struct revoke_entry {
	/**@brief  Block number not to be replayed.*/
	ext4_fsblk_t block;

	/**@brief  Transaction id recorded with the revoke.
	 *         The replay code skips a logged copy of @block when the
	 *         transaction being replayed is not newer than trans_id.*/
	uint32_t trans_id;

	/**@brief  Revoke tree node.*/
	RB_ENTRY(revoke_entry) revoke_node;
};

/**@brief  Valid journal replay information.*/
struct recover_info {
	/**@brief  Starting transaction id.*/
	uint32_t start_trans_id;

	/**@brief  Ending transaction id.*/
	uint32_t last_trans_id;

	/**@brief  Used as internal argument: the transaction id that is
	 *         stamped on revoke entries added while a revoke block is
	 *         being processed.*/
	uint32_t this_trans_id;

	/**@brief  Number of transactions counted by the scan pass.*/
	uint32_t trans_cnt;

	/**@brief  RB-Tree storing revoke entries.*/
	RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
};

/**@brief  Journal replay internal arguments.*/
struct replay_arg {
	/**@brief  Journal replay information.*/
	struct recover_info *info;

	/**@brief  Current journal block we are on (advanced by the
	 *         per-tag replay callback).*/
	uint32_t *this_block;

	/**@brief  Current trans_id we are on.*/
	uint32_t this_trans_id;
};

/* Make sure we wrap around the log correctly!
*/ #define wrap(sb, var) \ do { \ if (var >= jbd_get32((sb), maxlen)) \ var -= (jbd_get32((sb), maxlen) - jbd_get32((sb), first)); \ } while (0) static inline int32_t trans_id_diff(uint32_t x, uint32_t y) { int32_t diff = x - y; return diff; } static int jbd_revoke_entry_cmp(struct revoke_entry *a, struct revoke_entry *b) { if (a->block > b->block) return 1; else if (a->block < b->block) return -1; return 0; } static int jbd_block_rec_cmp(struct jbd_block_rec *a, struct jbd_block_rec *b) { if (a->lba > b->lba) return 1; else if (a->lba < b->lba) return -1; return 0; } static int jbd_revoke_rec_cmp(struct jbd_revoke_rec *a, struct jbd_revoke_rec *b) { if (a->lba > b->lba) return 1; else if (a->lba < b->lba) return -1; return 0; } RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node, jbd_revoke_entry_cmp, static inline) RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node, jbd_block_rec_cmp, static inline) RB_GENERATE_INTERNAL(jbd_revoke_tree, jbd_revoke_rec, revoke_node, jbd_revoke_rec_cmp, static inline) #define jbd_alloc_revoke_entry() ext4_calloc(1, sizeof(struct revoke_entry)) #define jbd_free_revoke_entry(addr) ext4_free(addr) static int jbd_has_csum(struct jbd_sb *jbd_sb) { if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V2)) return 2; if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) return 3; return 0; } #if CONFIG_META_CSUM_ENABLE static uint32_t jbd_sb_csum(struct jbd_sb *jbd_sb) { uint32_t checksum = 0; if (jbd_has_csum(jbd_sb)) { uint32_t orig_checksum = jbd_sb->checksum; jbd_set32(jbd_sb, checksum, 0); /* Calculate crc32c checksum against tho whole superblock */ checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_sb, JBD_SUPERBLOCK_SIZE); jbd_sb->checksum = orig_checksum; } return checksum; } #else #define jbd_sb_csum(...) 
0 #endif static void jbd_sb_csum_set(struct jbd_sb *jbd_sb) { if (!jbd_has_csum(jbd_sb)) return; jbd_set32(jbd_sb, checksum, jbd_sb_csum(jbd_sb)); } #if CONFIG_META_CSUM_ENABLE static bool jbd_verify_sb_csum(struct jbd_sb *jbd_sb) { if (!jbd_has_csum(jbd_sb)) return true; return jbd_sb_csum(jbd_sb) == jbd_get32(jbd_sb, checksum); } #else #define jbd_verify_sb_csum(...) true #endif #if CONFIG_META_CSUM_ENABLE static uint32_t jbd_meta_csum(struct jbd_fs *jbd_fs, struct jbd_bhdr *bhdr) { uint32_t checksum = 0; if (jbd_has_csum(&jbd_fs->sb)) { uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); struct jbd_block_tail *tail = (struct jbd_block_tail *)((char *)bhdr + block_size - sizeof(struct jbd_block_tail)); uint32_t orig_checksum = tail->checksum; tail->checksum = 0; /* First calculate crc32c checksum against fs uuid */ checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid, sizeof(jbd_fs->sb.uuid)); /* Calculate crc32c checksum against tho whole block */ checksum = ext4_crc32c(checksum, bhdr, block_size); tail->checksum = orig_checksum; } return checksum; } #else #define jbd_meta_csum(...) 0 #endif static void jbd_meta_csum_set(struct jbd_fs *jbd_fs, struct jbd_bhdr *bhdr) { uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); struct jbd_block_tail *tail = (struct jbd_block_tail *) ((char *)bhdr + block_size - sizeof(struct jbd_block_tail)); if (!jbd_has_csum(&jbd_fs->sb)) return; tail->checksum = to_be32(jbd_meta_csum(jbd_fs, bhdr)); } #if CONFIG_META_CSUM_ENABLE static bool jbd_verify_meta_csum(struct jbd_fs *jbd_fs, struct jbd_bhdr *bhdr) { uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); struct jbd_block_tail *tail = (struct jbd_block_tail *) ((char *)bhdr + block_size - sizeof(struct jbd_block_tail)); if (!jbd_has_csum(&jbd_fs->sb)) return true; return jbd_meta_csum(jbd_fs, bhdr) == to_be32(tail->checksum); } #else #define jbd_verify_meta_csum(...) 
true #endif #if CONFIG_META_CSUM_ENABLE static uint32_t jbd_commit_csum(struct jbd_fs *jbd_fs, struct jbd_commit_header *header) { uint32_t checksum = 0; if (jbd_has_csum(&jbd_fs->sb)) { uint32_t orig_checksum_type = header->chksum_type, orig_checksum_size = header->chksum_size, orig_checksum = header->chksum[0]; uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); header->chksum_type = 0; header->chksum_size = 0; header->chksum[0] = 0; /* First calculate crc32c checksum against fs uuid */ checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid, sizeof(jbd_fs->sb.uuid)); /* Calculate crc32c checksum against tho whole block */ checksum = ext4_crc32c(checksum, header, block_size); header->chksum_type = orig_checksum_type; header->chksum_size = orig_checksum_size; header->chksum[0] = orig_checksum; } return checksum; } #else #define jbd_commit_csum(...) 0 #endif static void jbd_commit_csum_set(struct jbd_fs *jbd_fs, struct jbd_commit_header *header) { if (!jbd_has_csum(&jbd_fs->sb)) return; header->chksum_type = 0; header->chksum_size = 0; header->chksum[0] = jbd_commit_csum(jbd_fs, header); } #if CONFIG_META_CSUM_ENABLE static bool jbd_verify_commit_csum(struct jbd_fs *jbd_fs, struct jbd_commit_header *header) { if (!jbd_has_csum(&jbd_fs->sb)) return true; return header->chksum[0] == to_be32(jbd_commit_csum(jbd_fs, header)); } #else #define jbd_verify_commit_csum(...) true #endif #if CONFIG_META_CSUM_ENABLE /* * NOTE: We only make use of @csum parameter when * JBD_FEATURE_COMPAT_CHECKSUM is enabled. */ static uint32_t jbd_block_csum(struct jbd_fs *jbd_fs, const void *buf, uint32_t csum, uint32_t sequence) { uint32_t checksum = 0; if (jbd_has_csum(&jbd_fs->sb)) { uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); /* First calculate crc32c checksum against fs uuid */ checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid, sizeof(jbd_fs->sb.uuid)); /* Then calculate crc32c checksum against sequence no. 
*/ checksum = ext4_crc32c(checksum, &sequence, sizeof(uint32_t)); /* Calculate crc32c checksum against tho whole block */ checksum = ext4_crc32c(checksum, buf, block_size); } else if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_COMPAT_CHECKSUM)) { uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize); /* Calculate crc32c checksum against tho whole block */ checksum = ext4_crc32(csum, buf, block_size); } return checksum; } #else #define jbd_block_csum(...) 0 #endif static void jbd_block_tag_csum_set(struct jbd_fs *jbd_fs, void *__tag, uint32_t checksum) { int ver = jbd_has_csum(&jbd_fs->sb); if (!ver) return; if (ver == 2) { struct jbd_block_tag *tag = __tag; tag->checksum = (uint16_t)to_be32(checksum); } else { struct jbd_block_tag3 *tag = __tag; tag->checksum = to_be32(checksum); } } /**@brief Write jbd superblock to disk. * @param jbd_fs jbd filesystem * @param s jbd superblock * @return standard error code*/ static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s) { int rc; struct ext4_fs *fs = jbd_fs->inode_ref.fs; uint64_t offset; ext4_fsblk_t fblock; rc = jbd_inode_bmap(jbd_fs, 0, &fblock); if (rc != EOK) return rc; jbd_sb_csum_set(s); offset = fblock * ext4_sb_get_block_size(&fs->sb); return ext4_block_writebytes(fs->bdev, offset, s, EXT4_SUPERBLOCK_SIZE); } /**@brief Read jbd superblock from disk. * @param jbd_fs jbd filesystem * @param s jbd superblock * @return standard error code*/ static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s) { int rc; struct ext4_fs *fs = jbd_fs->inode_ref.fs; uint64_t offset; ext4_fsblk_t fblock; rc = jbd_inode_bmap(jbd_fs, 0, &fblock); if (rc != EOK) return rc; offset = fblock * ext4_sb_get_block_size(&fs->sb); return ext4_block_readbytes(fs->bdev, offset, s, EXT4_SUPERBLOCK_SIZE); } /**@brief Verify jbd superblock. 
* @param sb jbd superblock * @return true if jbd superblock is valid */ static bool jbd_verify_sb(struct jbd_sb *sb) { struct jbd_bhdr *header = &sb->header; if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) return false; if (jbd_get32(header, blocktype) != JBD_SUPERBLOCK && jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2) return false; return jbd_verify_sb_csum(sb); } /**@brief Write back dirty jbd superblock to disk. * @param jbd_fs jbd filesystem * @return standard error code*/ static int jbd_write_sb(struct jbd_fs *jbd_fs) { int rc = EOK; if (jbd_fs->dirty) { rc = jbd_sb_write(jbd_fs, &jbd_fs->sb); if (rc != EOK) return rc; jbd_fs->dirty = false; } return rc; } /**@brief Get reference to jbd filesystem. * @param fs Filesystem to load journal of * @param jbd_fs jbd filesystem * @return standard error code*/ int jbd_get_fs(struct ext4_fs *fs, struct jbd_fs *jbd_fs) { int rc; uint32_t journal_ino; memset(jbd_fs, 0, sizeof(struct jbd_fs)); /* See if there is journal inode on this filesystem.*/ /* FIXME: detection on existance ofbkejournal bdev is * missing.*/ journal_ino = ext4_get32(&fs->sb, journal_inode_number); rc = ext4_fs_get_inode_ref(fs, journal_ino, &jbd_fs->inode_ref); if (rc != EOK) { memset(jbd_fs, 0, sizeof(struct jbd_fs)); return rc; } rc = jbd_sb_read(jbd_fs, &jbd_fs->sb); if (rc != EOK) { memset(jbd_fs, 0, sizeof(struct jbd_fs)); ext4_fs_put_inode_ref(&jbd_fs->inode_ref); return rc; } if (!jbd_verify_sb(&jbd_fs->sb)) { memset(jbd_fs, 0, sizeof(struct jbd_fs)); ext4_fs_put_inode_ref(&jbd_fs->inode_ref); rc = EIO; } if (rc == EOK) jbd_fs->bdev = fs->bdev; return rc; } /**@brief Put reference of jbd filesystem. * @param jbd_fs jbd filesystem * @return standard error code*/ int jbd_put_fs(struct jbd_fs *jbd_fs) { int rc = EOK; rc = jbd_write_sb(jbd_fs); ext4_fs_put_inode_ref(&jbd_fs->inode_ref); return rc; } /**@brief Data block lookup helper. 
* @param jbd_fs jbd filesystem * @param iblock block index * @param fblock logical block address * @return standard error code*/ int jbd_inode_bmap(struct jbd_fs *jbd_fs, ext4_lblk_t iblock, ext4_fsblk_t *fblock) { int rc = ext4_fs_get_inode_dblk_idx( &jbd_fs->inode_ref, iblock, fblock, false); return rc; } /**@brief jbd block get function (through cache). * @param jbd_fs jbd filesystem * @param block block descriptor * @param fblock jbd logical block address * @return standard error code*/ static int jbd_block_get(struct jbd_fs *jbd_fs, struct ext4_block *block, ext4_fsblk_t fblock) { /* TODO: journal device. */ int rc; struct ext4_blockdev *bdev = jbd_fs->bdev; ext4_lblk_t iblock = (ext4_lblk_t)fblock; /* Lookup the logical block address of * fblock.*/ rc = jbd_inode_bmap(jbd_fs, iblock, &fblock); if (rc != EOK) return rc; rc = ext4_block_get(bdev, block, fblock); /* If succeeded, mark buffer as BC_FLUSH to indicate * that data should be written to disk immediately.*/ if (rc == EOK) { ext4_bcache_set_flag(block->buf, BC_FLUSH); /* As we don't want to occupy too much space * in block cache, we set this buffer BC_TMP.*/ ext4_bcache_set_flag(block->buf, BC_TMP); } return rc; } /**@brief jbd block get function (through cache, don't read). * @param jbd_fs jbd filesystem * @param block block descriptor * @param fblock jbd logical block address * @return standard error code*/ static int jbd_block_get_noread(struct jbd_fs *jbd_fs, struct ext4_block *block, ext4_fsblk_t fblock) { /* TODO: journal device. */ int rc; struct ext4_blockdev *bdev = jbd_fs->bdev; ext4_lblk_t iblock = (ext4_lblk_t)fblock; rc = jbd_inode_bmap(jbd_fs, iblock, &fblock); if (rc != EOK) return rc; rc = ext4_block_get_noread(bdev, block, fblock); if (rc == EOK) ext4_bcache_set_flag(block->buf, BC_FLUSH); return rc; } /**@brief jbd block set procedure (through cache). 
* @param jbd_fs jbd filesystem * @param block block descriptor * @return standard error code*/ static int jbd_block_set(struct jbd_fs *jbd_fs, struct ext4_block *block) { struct ext4_blockdev *bdev = jbd_fs->bdev; return ext4_block_set(bdev, block); } /**@brief helper functions to calculate * block tag size, not including UUID part. * @param jbd_fs jbd filesystem * @return tag size in bytes*/ static int jbd_tag_bytes(struct jbd_fs *jbd_fs) { int size; /* It is very easy to deal with the case which * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/ if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) return sizeof(struct jbd_block_tag3); size = sizeof(struct jbd_block_tag); /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled, * add 2 bytes to size.*/ if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V2)) size += sizeof(uint16_t); if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_64BIT)) return size; /* If block number is 4 bytes in size, * minus 4 bytes from size */ return size - sizeof(uint32_t); } /**@brief Tag information. */ struct tag_info { /**@brief Tag size in bytes, including UUID part.*/ int tag_bytes; /**@brief block number stored in this tag.*/ ext4_fsblk_t block; /**@brief Is the first 4 bytes of block equals to * JBD_MAGIC_NUMBER? */ bool is_escape; /**@brief whether UUID part exists or not.*/ bool uuid_exist; /**@brief UUID content if UUID part exists.*/ uint8_t uuid[UUID_SIZE]; /**@brief Is this the last tag? */ bool last_tag; /**@brief crc32c checksum. */ uint32_t checksum; }; /**@brief Extract information from a block tag. * @param __tag pointer to the block tag * @param tag_bytes block tag size of this jbd filesystem * @param remaining size in buffer containing the block tag * @param tag_info information of this tag. 
* @return  EOK on success, EINVAL when the tag (or its UUID part) does
 *          not fit into the remaining buffer.*/
static int
jbd_extract_block_tag(struct jbd_fs *jbd_fs,
		      void *__tag,
		      int tag_bytes,
		      int32_t remain_buf_size,
		      struct tag_info *tag_info)
{
	uint32_t tag_flags;

	tag_info->tag_bytes = tag_bytes;
	tag_info->uuid_exist = false;
	tag_info->last_tag = false;
	tag_info->is_escape = false;

	/* The buffer must be able to hold at least the bare tag. */
	if (remain_buf_size - tag_bytes < 0)
		return EINVAL;

	/* Decode the layout-specific part: block number and flags. */
	if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
				     JBD_FEATURE_INCOMPAT_CSUM_V3)) {
		struct jbd_block_tag3 *tag3 = __tag;

		tag_info->block = jbd_get32(tag3, blocknr);
		if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
					     JBD_FEATURE_INCOMPAT_64BIT))
			tag_info->block |=
				(uint64_t)jbd_get32(tag3, blocknr_high) << 32;

		tag_flags = jbd_get32(tag3, flags);
	} else {
		struct jbd_block_tag *tag2 = __tag;

		tag_info->block = jbd_get32(tag2, blocknr);
		if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
					     JBD_FEATURE_INCOMPAT_64BIT))
			tag_info->block |=
				(uint64_t)jbd_get32(tag2, blocknr_high) << 32;

		tag_flags = jbd_get16(tag2, flags);
	}

	/* From here on both layouts are handled the same way. */
	if (tag_flags & JBD_FLAG_ESCAPE)
		tag_info->is_escape = true;

	if (!(tag_flags & JBD_FLAG_SAME_UUID)) {
		/* A UUID follows the tag; make sure it fits as well. */
		if (remain_buf_size - tag_bytes < UUID_SIZE)
			return EINVAL;

		tag_info->uuid_exist = true;
		tag_info->tag_bytes += UUID_SIZE;
		memcpy(tag_info->uuid, (char *)__tag + tag_bytes, UUID_SIZE);
	}

	if (tag_flags & JBD_FLAG_LAST_TAG)
		tag_info->last_tag = true;

	return EOK;
}

/**@brief  Write information to a block tag.
* @param __tag pointer to the block tag * @param remaining size in buffer containing the block tag * @param tag_info information of this tag. * @return EOK when succeed, otherwise return EINVAL.*/ static int jbd_write_block_tag(struct jbd_fs *jbd_fs, void *__tag, int32_t remain_buf_size, struct tag_info *tag_info) { char *uuid_start; int tag_bytes = jbd_tag_bytes(jbd_fs); tag_info->tag_bytes = tag_bytes; /* See whether it is possible to hold a valid block tag.*/ if (remain_buf_size - tag_bytes < 0) return EINVAL; if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) { struct jbd_block_tag3 *tag = __tag; memset(tag, 0, sizeof(struct jbd_block_tag3)); jbd_set32(tag, blocknr, (uint32_t)tag_info->block); if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_64BIT)) jbd_set32(tag, blocknr_high, tag_info->block >> 32); if (tag_info->uuid_exist) { /* See whether it is possible to hold UUID part.*/ if (remain_buf_size - tag_bytes < UUID_SIZE) return EINVAL; uuid_start = (char *)tag + tag_bytes; tag_info->tag_bytes += UUID_SIZE; memcpy(uuid_start, tag_info->uuid, UUID_SIZE); } else jbd_set32(tag, flags, jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID); jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum); if (tag_info->last_tag) jbd_set32(tag, flags, jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG); if (tag_info->is_escape) jbd_set32(tag, flags, jbd_get32(tag, flags) | JBD_FLAG_ESCAPE); } else { struct jbd_block_tag *tag = __tag; memset(tag, 0, sizeof(struct jbd_block_tag)); jbd_set32(tag, blocknr, (uint32_t)tag_info->block); if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_64BIT)) jbd_set32(tag, blocknr_high, tag_info->block >> 32); if (tag_info->uuid_exist) { /* See whether it is possible to hold UUID part.*/ if (remain_buf_size - tag_bytes < UUID_SIZE) return EINVAL; uuid_start = (char *)tag + tag_bytes; tag_info->tag_bytes += UUID_SIZE; memcpy(uuid_start, tag_info->uuid, UUID_SIZE); } else jbd_set16(tag, flags, jbd_get16(tag, flags) | 
JBD_FLAG_SAME_UUID); jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum); if (tag_info->last_tag) jbd_set16(tag, flags, jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG); if (tag_info->is_escape) jbd_set16(tag, flags, jbd_get16(tag, flags) | JBD_FLAG_ESCAPE); } return EOK; } /**@brief Iterate all block tags in a block. * @param jbd_fs jbd filesystem * @param __tag_start pointer to the block * @param tag_tbl_size size of the block * @param func callback routine to indicate that * a block tag is found * @param arg additional argument to be passed to func */ static void jbd_iterate_block_table(struct jbd_fs *jbd_fs, void *__tag_start, int32_t tag_tbl_size, void (*func)(struct jbd_fs * jbd_fs, struct tag_info *tag_info, void *arg), void *arg) { char *tag_start, *tag_ptr; int tag_bytes = jbd_tag_bytes(jbd_fs); tag_start = __tag_start; tag_ptr = tag_start; /* Cut off the size of block tail storing checksum. */ if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V2) || JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_CSUM_V3)) tag_tbl_size -= sizeof(struct jbd_block_tail); while (tag_tbl_size) { struct tag_info tag_info; int rc = jbd_extract_block_tag(jbd_fs, tag_ptr, tag_bytes, tag_tbl_size, &tag_info); if (rc != EOK) break; if (func) func(jbd_fs, &tag_info, arg); /* Stop the iteration when we reach the last tag. */ if (tag_info.last_tag) break; tag_ptr += tag_info.tag_bytes; tag_tbl_size -= tag_info.tag_bytes; } } static void jbd_display_block_tags(struct jbd_fs *jbd_fs, struct tag_info *tag_info, void *arg) { uint32_t *iblock = arg; ext4_dbg(DEBUG_JBD, "Block in block_tag: %" PRIu64 "\n", tag_info->block); (*iblock)++; wrap(&jbd_fs->sb, *iblock); (void)jbd_fs; return; } static struct revoke_entry * jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block) { struct revoke_entry tmp = { .block = block }; return RB_FIND(jbd_revoke, &info->revoke_root, &tmp); } /**@brief Replay a block in a transaction. 
* @param jbd_fs jbd filesystem * @param tag_info tag_info of the logged block.*/ static void jbd_replay_block_tags(struct jbd_fs *jbd_fs, struct tag_info *tag_info, void *__arg) { int r; struct replay_arg *arg = __arg; struct recover_info *info = arg->info; uint32_t *this_block = arg->this_block; struct revoke_entry *revoke_entry; struct ext4_block journal_block, ext4_block; struct ext4_fs *fs = jbd_fs->inode_ref.fs; (*this_block)++; wrap(&jbd_fs->sb, *this_block); /* We replay this block only if the current transaction id * is equal or greater than that in revoke entry.*/ revoke_entry = jbd_revoke_entry_lookup(info, tag_info->block); if (revoke_entry && trans_id_diff(arg->this_trans_id, revoke_entry->trans_id) <= 0) return; ext4_dbg(DEBUG_JBD, "Replaying block in block_tag: %" PRIu64 "\n", tag_info->block); r = jbd_block_get(jbd_fs, &journal_block, *this_block); if (r != EOK) return; /* We need special treatment for ext4 superblock. */ if (tag_info->block) { r = ext4_block_get_noread(fs->bdev, &ext4_block, tag_info->block); if (r != EOK) { jbd_block_set(jbd_fs, &journal_block); return; } memcpy(ext4_block.data, journal_block.data, jbd_get32(&jbd_fs->sb, blocksize)); if (tag_info->is_escape) ((struct jbd_bhdr *)ext4_block.data)->magic = to_be32(JBD_MAGIC_NUMBER); ext4_bcache_set_dirty(ext4_block.buf); ext4_block_set(fs->bdev, &ext4_block); } else { uint16_t mount_count, state; mount_count = ext4_get16(&fs->sb, mount_count); state = ext4_get16(&fs->sb, state); memcpy(&fs->sb, journal_block.data + EXT4_SUPERBLOCK_OFFSET, EXT4_SUPERBLOCK_SIZE); /* Mark system as mounted */ ext4_set16(&fs->sb, state, state); r = ext4_sb_write(fs->bdev, &fs->sb); if (r != EOK) return; /*Update mount count*/ ext4_set16(&fs->sb, mount_count, mount_count); } jbd_block_set(jbd_fs, &journal_block); return; } /**@brief Add block address to revoke tree, along with * its transaction id. 
* @param info journal replay info * @param block block address to be replayed.*/ static void jbd_add_revoke_block_tags(struct recover_info *info, ext4_fsblk_t block) { struct revoke_entry *revoke_entry; ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block); /* If the revoke entry with respect to the block address * exists already, update its transaction id.*/ revoke_entry = jbd_revoke_entry_lookup(info, block); if (revoke_entry) { revoke_entry->trans_id = info->this_trans_id; return; } revoke_entry = jbd_alloc_revoke_entry(); ext4_assert(revoke_entry); revoke_entry->block = block; revoke_entry->trans_id = info->this_trans_id; RB_INSERT(jbd_revoke, &info->revoke_root, revoke_entry); return; } static void jbd_destroy_revoke_tree(struct recover_info *info) { while (!RB_EMPTY(&info->revoke_root)) { struct revoke_entry *revoke_entry = RB_MIN(jbd_revoke, &info->revoke_root); ext4_assert(revoke_entry); RB_REMOVE(jbd_revoke, &info->revoke_root, revoke_entry); jbd_free_revoke_entry(revoke_entry); } } #define ACTION_SCAN 0 #define ACTION_REVOKE 1 #define ACTION_RECOVER 2 /**@brief Add entries in a revoke block to revoke tree. 
* @param jbd_fs jbd filesystem * @param header revoke block header * @param recover_info journal replay info*/ static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs, struct jbd_bhdr *header, struct recover_info *info) { char *blocks_entry; struct jbd_revoke_header *revoke_hdr = (struct jbd_revoke_header *)header; uint32_t i, nr_entries, record_len = 4; /* If we are working on a 64bit jbd filesystem, */ if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb, JBD_FEATURE_INCOMPAT_64BIT)) record_len = 8; nr_entries = (jbd_get32(revoke_hdr, count) - sizeof(struct jbd_revoke_header)) / record_len; blocks_entry = (char *)(revoke_hdr + 1); for (i = 0;i < nr_entries;i++) { if (record_len == 8) { uint64_t *blocks = (uint64_t *)blocks_entry; jbd_add_revoke_block_tags(info, to_be64(*blocks)); } else { uint32_t *blocks = (uint32_t *)blocks_entry; jbd_add_revoke_block_tags(info, to_be32(*blocks)); } blocks_entry += record_len; } } static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs, struct jbd_bhdr *header, uint32_t *iblock) { jbd_iterate_block_table(jbd_fs, header + 1, jbd_get32(&jbd_fs->sb, blocksize) - sizeof(struct jbd_bhdr), jbd_display_block_tags, iblock); } static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs, struct jbd_bhdr *header, struct replay_arg *arg) { jbd_iterate_block_table(jbd_fs, header + 1, jbd_get32(&jbd_fs->sb, blocksize) - sizeof(struct jbd_bhdr), jbd_replay_block_tags, arg); } /**@brief The core routine of journal replay. 
* @param  jbd_fs jbd filesystem
 * @param  info journal replay info
 * @param  action action needed to be taken: ACTION_SCAN counts the
 *         transactions and records the first/last transaction ids in
 *         @info; ACTION_REVOKE feeds revoke blocks into the revoke tree;
 *         ACTION_RECOVER replays descriptor blocks.
 * @return standard error code*/
static int jbd_iterate_log(struct jbd_fs *jbd_fs,
			   struct recover_info *info,
			   int action)
{
	int r = EOK;
	bool log_end = false;
	struct jbd_sb *sb = &jbd_fs->sb;
	uint32_t start_trans_id, this_trans_id;
	uint32_t start_block, this_block;

	/* We start iterating valid blocks in the whole journal.*/
	start_trans_id = this_trans_id = jbd_get32(sb, sequence);
	start_block = this_block = jbd_get32(sb, start);
	/* A scan restarts the transaction count; the other passes have
	 * nothing to do when the scan found no transaction. */
	if (action == ACTION_SCAN)
		info->trans_cnt = 0;
	else if (!info->trans_cnt)
		log_end = true;

	ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n",
			    start_trans_id);

	while (!log_end) {
		struct ext4_block block;
		struct jbd_bhdr *header;
		/* If we are not scanning for the last
		 * valid transaction in the journal,
		 * we will stop when we reach the end of
		 * the journal.*/
		if (action != ACTION_SCAN)
			if (trans_id_diff(this_trans_id, info->last_trans_id) > 0) {
				log_end = true;
				continue;
			}

		r = jbd_block_get(jbd_fs, &block, this_block);
		if (r != EOK)
			break;

		header = (struct jbd_bhdr *)block.data;
		/* This block does not have a valid magic number,
		 * so we have reached the end of the journal.*/
		if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) {
			jbd_block_set(jbd_fs, &block);
			log_end = true;
			continue;
		}

		/* If the transaction id we found is not expected,
		 * we may have reached the end of the journal.
		 *
		 * If we are not scanning the journal, something
		 * bad might have taken place. :-( */
		if (jbd_get32(header, sequence) != this_trans_id) {
			if (action != ACTION_SCAN)
				r = EIO;

			jbd_block_set(jbd_fs, &block);
			log_end = true;
			continue;
		}

		switch (jbd_get32(header, blocktype)) {
		case JBD_DESCRIPTOR_BLOCK:
			if (!jbd_verify_meta_csum(jbd_fs, header)) {
				ext4_dbg(DEBUG_JBD,
					DBG_WARN "Descriptor block checksum failed."
						"Journal block: %" PRIu32"\n",
						this_block);
				log_end = true;
				break;
			}
			ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", "
					    "trans_id: %" PRIu32"\n",
					    this_block, this_trans_id);
			/* Both helpers advance this_block past the data
			 * blocks described by the tags; only the recover
			 * pass actually copies them back. */
			if (action == ACTION_RECOVER) {
				struct replay_arg replay_arg;
				replay_arg.info = info;
				replay_arg.this_block = &this_block;
				replay_arg.this_trans_id = this_trans_id;

				jbd_replay_descriptor_block(jbd_fs,
						header, &replay_arg);
			} else
				jbd_debug_descriptor_block(jbd_fs,
						header, &this_block);

			break;
		case JBD_COMMIT_BLOCK:
			if (!jbd_verify_commit_csum(jbd_fs,
					(struct jbd_commit_header *)header)) {
				ext4_dbg(DEBUG_JBD,
					DBG_WARN "Commit block checksum failed."
						"Journal block: %" PRIu32"\n",
						this_block);
				log_end = true;
				break;
			}
			ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
					    "trans_id: %" PRIu32"\n",
					    this_block, this_trans_id);
			/*
			 * This is the end of a transaction,
			 * we may now proceed to the next transaction.
			 */
			this_trans_id++;
			if (action == ACTION_SCAN)
				info->trans_cnt++;
			break;
		case JBD_REVOKE_BLOCK:
			if (!jbd_verify_meta_csum(jbd_fs, header)) {
				ext4_dbg(DEBUG_JBD,
					DBG_WARN "Revoke block checksum failed."
						"Journal block: %" PRIu32"\n",
						this_block);
				log_end = true;
				break;
			}
			ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", "
					    "trans_id: %" PRIu32"\n",
					    this_block, this_trans_id);
			if (action == ACTION_REVOKE) {
				info->this_trans_id = this_trans_id;
				jbd_build_revoke_tree(jbd_fs,
						header, info);
			}
			break;
		default:
			/* Unknown block type: treat it as the end of the log. */
			log_end = true;
			break;
		}
		jbd_block_set(jbd_fs, &block);
		this_block++;
		wrap(sb, this_block);
		/* Back at the starting block: the whole log was visited. */
		if (this_block == start_block)
			log_end = true;

	}
	ext4_dbg(DEBUG_JBD, "End of journal.\n");
	if (r == EOK && action == ACTION_SCAN) {
		/* We have finished scanning the journal. */
		info->start_trans_id = start_trans_id;
		/* this_trans_id is one past the last committed transaction
		 * when at least one commit block was seen. */
		if (trans_id_diff(this_trans_id, start_trans_id) > 0)
			info->last_trans_id = this_trans_id - 1;
		else
			info->last_trans_id = this_trans_id;
	}

	return r;
}

/**@brief  Replay journal.
* @param jbd_fs jbd filesystem * @return standard error code*/ int jbd_recover(struct jbd_fs *jbd_fs) { int r; struct recover_info info; struct jbd_sb *sb = &jbd_fs->sb; if (!sb->start) return EOK; RB_INIT(&info.revoke_root); r = jbd_iterate_log(jbd_fs, &info, ACTION_SCAN); if (r != EOK) return r; r = jbd_iterate_log(jbd_fs, &info, ACTION_REVOKE); if (r != EOK) return r; r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER); if (r == EOK) { /* If we successfully replay the journal, * clear EXT4_FINCOM_RECOVER flag on the * ext4 superblock, and set the start of * journal to 0.*/ uint32_t features_incompatible = ext4_get32(&jbd_fs->inode_ref.fs->sb, features_incompatible); jbd_set32(&jbd_fs->sb, start, 0); jbd_set32(&jbd_fs->sb, sequence, info.last_trans_id); features_incompatible &= ~EXT4_FINCOM_RECOVER; ext4_set32(&jbd_fs->inode_ref.fs->sb, features_incompatible, features_incompatible); jbd_fs->dirty = true; r = ext4_sb_write(jbd_fs->bdev, &jbd_fs->inode_ref.fs->sb); } jbd_destroy_revoke_tree(&info); return r; } static void jbd_journal_write_sb(struct jbd_journal *journal) { struct jbd_fs *jbd_fs = journal->jbd_fs; jbd_set32(&jbd_fs->sb, start, journal->start); jbd_set32(&jbd_fs->sb, sequence, journal->trans_id); jbd_fs->dirty = true; } /**@brief Start accessing the journal. 
* @param jbd_fs jbd filesystem * @param journal current journal session * @return standard error code*/ int jbd_journal_start(struct jbd_fs *jbd_fs, struct jbd_journal *journal) { int r; uint32_t features_incompatible = ext4_get32(&jbd_fs->inode_ref.fs->sb, features_incompatible); features_incompatible |= EXT4_FINCOM_RECOVER; ext4_set32(&jbd_fs->inode_ref.fs->sb, features_incompatible, features_incompatible); r = ext4_sb_write(jbd_fs->bdev, &jbd_fs->inode_ref.fs->sb); if (r != EOK) return r; journal->first = jbd_get32(&jbd_fs->sb, first); journal->start = journal->first; journal->last = journal->first; /* * To invalidate any stale records we need to start from * the checkpoint transaction ID of the previous journalling session * plus 1. */ journal->trans_id = jbd_get32(&jbd_fs->sb, sequence) + 1; journal->alloc_trans_id = journal->trans_id; journal->block_size = jbd_get32(&jbd_fs->sb, blocksize); TAILQ_INIT(&journal->cp_queue); RB_INIT(&journal->block_rec_root); journal->jbd_fs = jbd_fs; jbd_journal_write_sb(journal); r = jbd_write_sb(jbd_fs); if (r != EOK) return r; jbd_fs->bdev->journal = journal; return EOK; } static void jbd_trans_end_write(struct ext4_bcache *bc __unused, struct ext4_buf *buf __unused, int res, void *arg); /* * This routine is only suitable to committed transactions. */ static void jbd_journal_flush_trans(struct jbd_trans *trans) { struct jbd_buf *jbd_buf, *tmp; struct jbd_journal *journal = trans->journal; struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs; void *tmp_data = ext4_malloc(journal->block_size); ext4_assert(tmp_data); TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) { struct ext4_buf *buf; struct ext4_block block; /* The buffer is not yet flushed. 
*/ buf = ext4_bcache_find_get(fs->bdev->bc, &block, jbd_buf->block_rec->lba); if (!(buf && ext4_bcache_test_flag(buf, BC_UPTODATE) && jbd_buf->block_rec->trans == trans)) { int r; struct ext4_block jbd_block = EXT4_BLOCK_ZERO(); ext4_assert(jbd_block_get(journal->jbd_fs, &jbd_block, jbd_buf->jbd_lba) == EOK); memcpy(tmp_data, jbd_block.data, journal->block_size); ext4_block_set(fs->bdev, &jbd_block); r = ext4_blocks_set_direct(fs->bdev, tmp_data, jbd_buf->block_rec->lba, 1); jbd_trans_end_write(fs->bdev->bc, buf, r, jbd_buf); } else ext4_block_flush_buf(fs->bdev, buf); if (buf) ext4_block_set(fs->bdev, &block); } ext4_free(tmp_data); } static void jbd_journal_skip_pure_revoke(struct jbd_journal *journal, struct jbd_trans *trans) { journal->start = trans->start_iblock + trans->alloc_blocks; wrap(&journal->jbd_fs->sb, journal->start); journal->trans_id = trans->trans_id + 1; jbd_journal_free_trans(journal, trans, false); jbd_journal_write_sb(journal); } void jbd_journal_purge_cp_trans(struct jbd_journal *journal, bool flush, bool once) { struct jbd_trans *trans; while ((trans = TAILQ_FIRST(&journal->cp_queue))) { if (!trans->data_cnt) { TAILQ_REMOVE(&journal->cp_queue, trans, trans_node); jbd_journal_skip_pure_revoke(journal, trans); } else { if (trans->data_cnt == trans->written_cnt) { journal->start = trans->start_iblock + trans->alloc_blocks; wrap(&journal->jbd_fs->sb, journal->start); journal->trans_id = trans->trans_id + 1; TAILQ_REMOVE(&journal->cp_queue, trans, trans_node); jbd_journal_free_trans(journal, trans, false); jbd_journal_write_sb(journal); } else if (!flush) { journal->start = trans->start_iblock; wrap(&journal->jbd_fs->sb, journal->start); journal->trans_id = trans->trans_id; jbd_journal_write_sb(journal); break; } else jbd_journal_flush_trans(trans); } if (once) break; } } /**@brief Stop accessing the journal. 
* @param journal current journal session * @return standard error code*/ int jbd_journal_stop(struct jbd_journal *journal) { int r; struct jbd_fs *jbd_fs = journal->jbd_fs; uint32_t features_incompatible; /* Make sure that journalled content have reached * the disk.*/ jbd_journal_purge_cp_trans(journal, true, false); /* There should be no block record in this journal * session. */ if (!RB_EMPTY(&journal->block_rec_root)) ext4_dbg(DEBUG_JBD, DBG_WARN "There are still block records " "in this journal session!\n"); features_incompatible = ext4_get32(&jbd_fs->inode_ref.fs->sb, features_incompatible); features_incompatible &= ~EXT4_FINCOM_RECOVER; ext4_set32(&jbd_fs->inode_ref.fs->sb, features_incompatible, features_incompatible); r = ext4_sb_write(jbd_fs->bdev, &jbd_fs->inode_ref.fs->sb); if (r != EOK) return r; journal->start = 0; journal->trans_id = 0; jbd_journal_write_sb(journal); return jbd_write_sb(journal->jbd_fs); } /**@brief Allocate a block in the journal. * @param journal current journal session * @param trans transaction * @return allocated block address*/ static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal, struct jbd_trans *trans) { uint32_t start_block; start_block = journal->last++; trans->alloc_blocks++; wrap(&journal->jbd_fs->sb, journal->last); /* If there is no space left, flush just one journalled * transaction.*/ if (journal->last == journal->start) { jbd_journal_purge_cp_trans(journal, true, true); ext4_assert(journal->last != journal->start); } return start_block; } static struct jbd_block_rec * jbd_trans_block_rec_lookup(struct jbd_journal *journal, ext4_fsblk_t lba) { struct jbd_block_rec tmp = { .lba = lba }; return RB_FIND(jbd_block, &journal->block_rec_root, &tmp); } static void jbd_trans_change_ownership(struct jbd_block_rec *block_rec, struct jbd_trans *new_trans) { LIST_REMOVE(block_rec, tbrec_node); if (new_trans) { /* Now this block record belongs to this transaction. 
*/ LIST_INSERT_HEAD(&new_trans->tbrec_list, block_rec, tbrec_node); } block_rec->trans = new_trans; } static inline struct jbd_block_rec * jbd_trans_insert_block_rec(struct jbd_trans *trans, ext4_fsblk_t lba) { struct jbd_block_rec *block_rec; block_rec = jbd_trans_block_rec_lookup(trans->journal, lba); if (block_rec) { jbd_trans_change_ownership(block_rec, trans); return block_rec; } block_rec = ext4_calloc(1, sizeof(struct jbd_block_rec)); if (!block_rec) return NULL; block_rec->lba = lba; block_rec->trans = trans; TAILQ_INIT(&block_rec->dirty_buf_queue); LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node); RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec); return block_rec; } /* * This routine will do the dirty works. */ static void jbd_trans_finish_callback(struct jbd_journal *journal, const struct jbd_trans *trans, struct jbd_block_rec *block_rec, bool abort, bool revoke) { struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs; if (block_rec->trans != trans) return; if (!abort) { struct jbd_buf *jbd_buf, *tmp; TAILQ_FOREACH_SAFE(jbd_buf, &block_rec->dirty_buf_queue, dirty_buf_node, tmp) { jbd_trans_end_write(fs->bdev->bc, NULL, EOK, jbd_buf); } } else { /* * We have to roll back data if the block is going to be * aborted. 
*/ struct jbd_buf *jbd_buf; struct ext4_block jbd_block = EXT4_BLOCK_ZERO(), block = EXT4_BLOCK_ZERO(); jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue, jbd_buf_dirty); if (jbd_buf) { if (!revoke) { ext4_assert(ext4_block_get_noread(fs->bdev, &block, block_rec->lba) == EOK); ext4_assert(jbd_block_get(journal->jbd_fs, &jbd_block, jbd_buf->jbd_lba) == EOK); memcpy(block.data, jbd_block.data, journal->block_size); jbd_trans_change_ownership(block_rec, jbd_buf->trans); block.buf->end_write = jbd_trans_end_write; block.buf->end_write_arg = jbd_buf; ext4_bcache_set_flag(jbd_block.buf, BC_TMP); ext4_bcache_set_dirty(block.buf); ext4_block_set(fs->bdev, &jbd_block); ext4_block_set(fs->bdev, &block); return; } else { /* The revoked buffer is yet written. */ jbd_trans_change_ownership(block_rec, jbd_buf->trans); } } } } static inline void jbd_trans_remove_block_rec(struct jbd_journal *journal, struct jbd_block_rec *block_rec, struct jbd_trans *trans) { /* If this block record doesn't belong to this transaction, * give up.*/ if (block_rec->trans == trans) { LIST_REMOVE(block_rec, tbrec_node); RB_REMOVE(jbd_block, &journal->block_rec_root, block_rec); ext4_free(block_rec); } } /**@brief Add block to a transaction and mark it dirty. 
* @param trans transaction * @param block block descriptor * @return standard error code*/ int jbd_trans_set_block_dirty(struct jbd_trans *trans, struct ext4_block *block) { struct jbd_buf *jbd_buf; struct jbd_revoke_rec *rec, tmp_rec = { .lba = block->lb_id }; struct jbd_block_rec *block_rec; if (block->buf->end_write == jbd_trans_end_write) { jbd_buf = block->buf->end_write_arg; if (jbd_buf && jbd_buf->trans == trans) return EOK; } jbd_buf = ext4_calloc(1, sizeof(struct jbd_buf)); if (!jbd_buf) return ENOMEM; if ((block_rec = jbd_trans_insert_block_rec(trans, block->lb_id)) == NULL) { ext4_free(jbd_buf); return ENOMEM; } TAILQ_INSERT_TAIL(&block_rec->dirty_buf_queue, jbd_buf, dirty_buf_node); jbd_buf->block_rec = block_rec; jbd_buf->trans = trans; jbd_buf->block = *block; ext4_bcache_inc_ref(block->buf); /* If the content reach the disk, notify us * so that we may do a checkpoint. */ block->buf->end_write = jbd_trans_end_write; block->buf->end_write_arg = jbd_buf; trans->data_cnt++; TAILQ_INSERT_HEAD(&trans->buf_queue, jbd_buf, buf_node); ext4_bcache_set_dirty(block->buf); rec = RB_FIND(jbd_revoke_tree, &trans->revoke_root, &tmp_rec); if (rec) { RB_REMOVE(jbd_revoke_tree, &trans->revoke_root, rec); ext4_free(rec); } return EOK; } /**@brief Add block to be revoked to a transaction * @param trans transaction * @param lba logical block address * @return standard error code*/ int jbd_trans_revoke_block(struct jbd_trans *trans, ext4_fsblk_t lba) { struct jbd_revoke_rec tmp_rec = { .lba = lba }, *rec; rec = RB_FIND(jbd_revoke_tree, &trans->revoke_root, &tmp_rec); if (rec) return EOK; rec = ext4_calloc(1, sizeof(struct jbd_revoke_rec)); if (!rec) return ENOMEM; rec->lba = lba; RB_INSERT(jbd_revoke_tree, &trans->revoke_root, rec); return EOK; } /**@brief Try to add block to be revoked to a transaction. * If @lba still remains in an transaction on checkpoint * queue, add @lba as a revoked block to the transaction. 
* @param trans transaction * @param lba logical block address * @return standard error code*/ int jbd_trans_try_revoke_block(struct jbd_trans *trans, ext4_fsblk_t lba) { struct jbd_journal *journal = trans->journal; struct jbd_block_rec *block_rec = jbd_trans_block_rec_lookup(journal, lba); if (block_rec) { if (block_rec->trans == trans) { struct jbd_buf *jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue, jbd_buf_dirty); /* If there are still unwritten buffers. */ if (TAILQ_FIRST(&block_rec->dirty_buf_queue) != jbd_buf) jbd_trans_revoke_block(trans, lba); } else jbd_trans_revoke_block(trans, lba); } return EOK; } /**@brief Free a transaction * @param journal current journal session * @param trans transaction * @param abort discard all the modifications on the block? * @return standard error code*/ void jbd_journal_free_trans(struct jbd_journal *journal, struct jbd_trans *trans, bool abort) { struct jbd_buf *jbd_buf, *tmp; struct jbd_revoke_rec *rec, *tmp2; struct jbd_block_rec *block_rec, *tmp3; struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs; TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) { block_rec = jbd_buf->block_rec; if (abort) { jbd_buf->block.buf->end_write = NULL; jbd_buf->block.buf->end_write_arg = NULL; ext4_bcache_clear_dirty(jbd_buf->block.buf); ext4_block_set(fs->bdev, &jbd_buf->block); } TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue, jbd_buf, dirty_buf_node); jbd_trans_finish_callback(journal, trans, block_rec, abort, false); TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node); ext4_free(jbd_buf); } RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root, tmp2) { RB_REMOVE(jbd_revoke_tree, &trans->revoke_root, rec); ext4_free(rec); } LIST_FOREACH_SAFE(block_rec, &trans->tbrec_list, tbrec_node, tmp3) { jbd_trans_remove_block_rec(journal, block_rec, trans); } ext4_free(trans); } /**@brief Write commit block for a transaction * @param trans transaction * @return standard error code*/ static int jbd_trans_write_commit_block(struct 
jbd_trans *trans) { int rc; struct ext4_block block; struct jbd_commit_header *header; uint32_t commit_iblock; struct jbd_journal *journal = trans->journal; commit_iblock = jbd_journal_alloc_block(journal, trans); rc = jbd_block_get_noread(journal->jbd_fs, &block, commit_iblock); if (rc != EOK) return rc; header = (struct jbd_commit_header *)block.data; jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER); jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK); jbd_set32(&header->header, sequence, trans->trans_id); if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb, JBD_FEATURE_COMPAT_CHECKSUM)) { jbd_set32(header, chksum_type, JBD_CRC32_CHKSUM); jbd_set32(header, chksum_size, JBD_CRC32_CHKSUM_SIZE); jbd_set32(header, chksum[0], trans->data_csum); } jbd_commit_csum_set(journal->jbd_fs, header); ext4_bcache_set_dirty(block.buf); ext4_bcache_set_flag(block.buf, BC_TMP); rc = jbd_block_set(journal->jbd_fs, &block); return rc; } /**@brief Write descriptor block for a transaction * @param journal current journal session * @param trans transaction * @return standard error code*/ static int jbd_journal_prepare(struct jbd_journal *journal, struct jbd_trans *trans) { int rc = EOK, i = 0; struct ext4_block desc_block = EXT4_BLOCK_ZERO(), data_block = EXT4_BLOCK_ZERO(); int32_t tag_tbl_size = 0; uint32_t desc_iblock = 0; uint32_t data_iblock = 0; char *tag_start = NULL, *tag_ptr = NULL; struct jbd_buf *jbd_buf, *tmp; struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs; uint32_t checksum = EXT4_CRC32_INIT; struct jbd_bhdr *bhdr = NULL; void *data; /* Try to remove any non-dirty buffers from the tail of * buf_queue. */ TAILQ_FOREACH_REVERSE_SAFE(jbd_buf, &trans->buf_queue, jbd_trans_buf, buf_node, tmp) { struct jbd_revoke_rec tmp_rec = { .lba = jbd_buf->block_rec->lba }; /* We stop the iteration when we find a dirty buffer. 
*/ if (ext4_bcache_test_flag(jbd_buf->block.buf, BC_DIRTY)) break; TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue, jbd_buf, dirty_buf_node); jbd_buf->block.buf->end_write = NULL; jbd_buf->block.buf->end_write_arg = NULL; jbd_trans_finish_callback(journal, trans, jbd_buf->block_rec, true, RB_FIND(jbd_revoke_tree, &trans->revoke_root, &tmp_rec)); jbd_trans_remove_block_rec(journal, jbd_buf->block_rec, trans); trans->data_cnt--; ext4_block_set(fs->bdev, &jbd_buf->block); TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node); ext4_free(jbd_buf); } TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) { struct tag_info tag_info; bool uuid_exist = false; bool is_escape = false; struct jbd_revoke_rec tmp_rec = { .lba = jbd_buf->block_rec->lba }; if (!ext4_bcache_test_flag(jbd_buf->block.buf, BC_DIRTY)) { TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue, jbd_buf, dirty_buf_node); jbd_buf->block.buf->end_write = NULL; jbd_buf->block.buf->end_write_arg = NULL; /* The buffer has not been modified, just release * that jbd_buf. 
*/ jbd_trans_finish_callback(journal, trans, jbd_buf->block_rec, true, RB_FIND(jbd_revoke_tree, &trans->revoke_root, &tmp_rec)); jbd_trans_remove_block_rec(journal, jbd_buf->block_rec, trans); trans->data_cnt--; ext4_block_set(fs->bdev, &jbd_buf->block); TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node); ext4_free(jbd_buf); continue; } checksum = jbd_block_csum(journal->jbd_fs, jbd_buf->block.data, checksum, trans->trans_id); if (((struct jbd_bhdr *)jbd_buf->block.data)->magic == to_be32(JBD_MAGIC_NUMBER)) is_escape = true; again: if (!desc_iblock) { desc_iblock = jbd_journal_alloc_block(journal, trans); rc = jbd_block_get_noread(journal->jbd_fs, &desc_block, desc_iblock); if (rc != EOK) break; bhdr = (struct jbd_bhdr *)desc_block.data; jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER); jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK); jbd_set32(bhdr, sequence, trans->trans_id); tag_start = (char *)(bhdr + 1); tag_ptr = tag_start; uuid_exist = true; tag_tbl_size = journal->block_size - sizeof(struct jbd_bhdr); if (jbd_has_csum(&journal->jbd_fs->sb)) tag_tbl_size -= sizeof(struct jbd_block_tail); if (!trans->start_iblock) trans->start_iblock = desc_iblock; ext4_bcache_set_dirty(desc_block.buf); ext4_bcache_set_flag(desc_block.buf, BC_TMP); } tag_info.block = jbd_buf->block.lb_id; tag_info.uuid_exist = uuid_exist; tag_info.is_escape = is_escape; if (i == trans->data_cnt - 1) tag_info.last_tag = true; else tag_info.last_tag = false; tag_info.checksum = checksum; if (uuid_exist) memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid, UUID_SIZE); rc = jbd_write_block_tag(journal->jbd_fs, tag_ptr, tag_tbl_size, &tag_info); if (rc != EOK) { jbd_meta_csum_set(journal->jbd_fs, bhdr); desc_iblock = 0; rc = jbd_block_set(journal->jbd_fs, &desc_block); if (rc != EOK) break; goto again; } data_iblock = jbd_journal_alloc_block(journal, trans); rc = jbd_block_get_noread(journal->jbd_fs, &data_block, data_iblock); if (rc != EOK) { desc_iblock = 0; ext4_bcache_clear_dirty(desc_block.buf); 
jbd_block_set(journal->jbd_fs, &desc_block); break; } data = data_block.data; memcpy(data, jbd_buf->block.data, journal->block_size); if (is_escape) ((struct jbd_bhdr *)data)->magic = 0; ext4_bcache_set_dirty(data_block.buf); ext4_bcache_set_flag(data_block.buf, BC_TMP); rc = jbd_block_set(journal->jbd_fs, &data_block); if (rc != EOK) { desc_iblock = 0; ext4_bcache_clear_dirty(desc_block.buf); jbd_block_set(journal->jbd_fs, &desc_block); break; } jbd_buf->jbd_lba = data_iblock; tag_ptr += tag_info.tag_bytes; tag_tbl_size -= tag_info.tag_bytes; i++; } if (rc == EOK && desc_iblock) { jbd_meta_csum_set(journal->jbd_fs, (struct jbd_bhdr *)bhdr); trans->data_csum = checksum; rc = jbd_block_set(journal->jbd_fs, &desc_block); } return rc; } /**@brief Write revoke block for a transaction * @param journal current journal session * @param trans transaction * @return standard error code*/ static int jbd_journal_prepare_revoke(struct jbd_journal *journal, struct jbd_trans *trans) { int rc = EOK, i = 0; struct ext4_block desc_block = EXT4_BLOCK_ZERO(); int32_t tag_tbl_size = 0; uint32_t desc_iblock = 0; char *blocks_entry = NULL; struct jbd_revoke_rec *rec, *tmp; struct jbd_revoke_header *header = NULL; int32_t record_len = 4; struct jbd_bhdr *bhdr = NULL; if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb, JBD_FEATURE_INCOMPAT_64BIT)) record_len = 8; RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root, tmp) { again: if (!desc_iblock) { desc_iblock = jbd_journal_alloc_block(journal, trans); rc = jbd_block_get_noread(journal->jbd_fs, &desc_block, desc_iblock); if (rc != EOK) break; bhdr = (struct jbd_bhdr *)desc_block.data; jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER); jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK); jbd_set32(bhdr, sequence, trans->trans_id); header = (struct jbd_revoke_header *)bhdr; blocks_entry = (char *)(header + 1); tag_tbl_size = journal->block_size - sizeof(struct jbd_revoke_header); if (jbd_has_csum(&journal->jbd_fs->sb)) tag_tbl_size -= sizeof(struct 
jbd_block_tail); if (!trans->start_iblock) trans->start_iblock = desc_iblock; ext4_bcache_set_dirty(desc_block.buf); ext4_bcache_set_flag(desc_block.buf, BC_TMP); } if (tag_tbl_size < record_len) { jbd_set32(header, count, journal->block_size - tag_tbl_size); jbd_meta_csum_set(journal->jbd_fs, bhdr); bhdr = NULL; desc_iblock = 0; header = NULL; rc = jbd_block_set(journal->jbd_fs, &desc_block); if (rc != EOK) break; goto again; } if (record_len == 8) { uint64_t *blocks = (uint64_t *)blocks_entry; *blocks = to_be64(rec->lba); } else { uint32_t *blocks = (uint32_t *)blocks_entry; *blocks = to_be32((uint32_t)rec->lba); } blocks_entry += record_len; tag_tbl_size -= record_len; i++; } if (rc == EOK && desc_iblock) { if (header != NULL) jbd_set32(header, count, journal->block_size - tag_tbl_size); jbd_meta_csum_set(journal->jbd_fs, bhdr); rc = jbd_block_set(journal->jbd_fs, &desc_block); } return rc; } /**@brief Put references of block descriptors in a transaction. * @param journal current journal session * @param trans transaction*/ void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans) { struct jbd_buf *jbd_buf, *tmp; struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs; TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) { struct ext4_block block = jbd_buf->block; ext4_block_set(fs->bdev, &block); } } /**@brief Update the start block of the journal when * all the contents in a transaction reach the disk.*/ static void jbd_trans_end_write(struct ext4_bcache *bc __unused, struct ext4_buf *buf, int res, void *arg) { struct jbd_buf *jbd_buf = arg; struct jbd_trans *trans = jbd_buf->trans; struct jbd_block_rec *block_rec = jbd_buf->block_rec; struct jbd_journal *journal = trans->journal; bool first_in_queue = trans == TAILQ_FIRST(&journal->cp_queue); if (res != EOK) trans->error = res; TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node); TAILQ_REMOVE(&block_rec->dirty_buf_queue, jbd_buf, dirty_buf_node); jbd_trans_finish_callback(journal, 
trans, jbd_buf->block_rec, false, false); if (block_rec->trans == trans && buf) { /* Clear the end_write and end_write_arg fields. */ buf->end_write = NULL; buf->end_write_arg = NULL; } ext4_free(jbd_buf); trans->written_cnt++; if (trans->written_cnt == trans->data_cnt) { /* If it is the first transaction on checkpoint queue, * we will shift the start of the journal to the next * transaction, and remove subsequent written * transactions from checkpoint queue until we find * an unwritten one. */ if (first_in_queue) { journal->start = trans->start_iblock + trans->alloc_blocks; wrap(&journal->jbd_fs->sb, journal->start); journal->trans_id = trans->trans_id + 1; TAILQ_REMOVE(&journal->cp_queue, trans, trans_node); jbd_journal_free_trans(journal, trans, false); jbd_journal_purge_cp_trans(journal, false, true); jbd_journal_write_sb(journal); jbd_write_sb(journal->jbd_fs); } } } /**@brief Commit a transaction to the journal immediately. * @param journal current journal session * @param trans transaction * @return standard error code*/ static int __jbd_journal_commit_trans(struct jbd_journal *journal, struct jbd_trans *trans) { int rc = EOK; uint32_t last = journal->last; struct jbd_revoke_rec *rec, *tmp; trans->trans_id = journal->alloc_trans_id; rc = jbd_journal_prepare(journal, trans); if (rc != EOK) goto Finish; rc = jbd_journal_prepare_revoke(journal, trans); if (rc != EOK) goto Finish; if (TAILQ_EMPTY(&trans->buf_queue) && RB_EMPTY(&trans->revoke_root)) { /* Since there are no entries in both buffer list * and revoke entry list, we do not consider trans as * complete transaction and just return EOK.*/ jbd_journal_free_trans(journal, trans, false); goto Finish; } rc = jbd_trans_write_commit_block(trans); if (rc != EOK) goto Finish; journal->alloc_trans_id++; /* Complete the checkpoint of buffers which are revoked. 
*/ RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root, tmp) { struct jbd_block_rec *block_rec = jbd_trans_block_rec_lookup(journal, rec->lba); struct jbd_buf *jbd_buf = NULL; if (block_rec) jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue, jbd_buf_dirty); if (jbd_buf) { struct ext4_buf *buf; struct ext4_block block = EXT4_BLOCK_ZERO(); /* * We do this to reset the ext4_buf::end_write and * ext4_buf::end_write_arg fields so that the checkpoint * callback won't be triggered again. */ buf = ext4_bcache_find_get(journal->jbd_fs->bdev->bc, &block, jbd_buf->block_rec->lba); jbd_trans_end_write(journal->jbd_fs->bdev->bc, buf, EOK, jbd_buf); if (buf) ext4_block_set(journal->jbd_fs->bdev, &block); } } if (TAILQ_EMPTY(&journal->cp_queue)) { /* * This transaction is going to be the first object in the * checkpoint queue. * When the first transaction in checkpoint queue is completely * written to disk, we shift the tail of the log to right. */ if (trans->data_cnt) { journal->start = trans->start_iblock; wrap(&journal->jbd_fs->sb, journal->start); journal->trans_id = trans->trans_id; jbd_journal_write_sb(journal); jbd_write_sb(journal->jbd_fs); TAILQ_INSERT_TAIL(&journal->cp_queue, trans, trans_node); jbd_journal_cp_trans(journal, trans); } else { journal->start = trans->start_iblock + trans->alloc_blocks; wrap(&journal->jbd_fs->sb, journal->start); journal->trans_id = trans->trans_id + 1; jbd_journal_write_sb(journal); jbd_journal_free_trans(journal, trans, false); } } else { /* No need to do anything to the JBD superblock. 
*/ TAILQ_INSERT_TAIL(&journal->cp_queue, trans, trans_node); if (trans->data_cnt) jbd_journal_cp_trans(journal, trans); } Finish: if (rc != EOK && rc != ENOSPC) { journal->last = last; jbd_journal_free_trans(journal, trans, true); } return rc; } /**@brief Allocate a new transaction * @param journal current journal session * @return transaction allocated*/ struct jbd_trans * jbd_journal_new_trans(struct jbd_journal *journal) { struct jbd_trans *trans = NULL; trans = ext4_calloc(1, sizeof(struct jbd_trans)); if (!trans) return NULL; /* We will assign a trans_id to this transaction, * once it has been committed.*/ trans->journal = journal; trans->data_csum = EXT4_CRC32_INIT; trans->error = EOK; TAILQ_INIT(&trans->buf_queue); return trans; } /**@brief Commit a transaction to the journal immediately. * @param journal current journal session * @param trans transaction * @return standard error code*/ int jbd_journal_commit_trans(struct jbd_journal *journal, struct jbd_trans *trans) { int r = EOK; r = __jbd_journal_commit_trans(journal, trans); return r; } /** * @} */