69天探索操作系统-第45天:文件系统日志记录

210 阅读5分钟
pro17.avif

1.介绍

文件系统日志记录是一种用于确保文件系统数据一致性和可靠性的技术。它通过在更改提交到主文件系统之前维护一个日志(即日志文件)来实现。这使得系统能够在发生崩溃或断电时,通过重放或撤销日志中记录的交易来快速恢复。

  • 事务日志: 在应用更改之前,将文件系统的更改记录在日志中。
  • 原子操作: 确保事务要么完全完成,要么完全回滚。
  • 崩溃恢复: 使用日志在崩溃后恢复文件系统到一致状态。
  • 一致性保证: 确保文件系统始终处于有效状态。

image.png

2. 日志结构和组成

一个日志通常被实现为一个磁盘专用区域的循环缓冲区。 它由一系列日志条目组成,每个条目代表对文件系统的更改。

日志结构实现

以下是C语言中实现的基本日志结构:

typedef enum {
    TXN_START,
    TXN_COMMIT,
    TXN_ABORT,
    DATA_WRITE,
    METADATA_UPDATE
} JournalEntryType;

typedef struct {
    uint64_t sequence_number;
    JournalEntryType type;
    uint64_t timestamp;
    uint32_t data_length;
    void *data;
    uint32_t checksum;
} JournalEntry;

typedef struct {
    int fd;                     // Journal file descriptor
    uint64_t current_position;  // Current write position
    uint64_t size;             // Total journal size
    uint64_t wrap_point;       // Position where journal wraps
    pthread_mutex_t lock;
    bool circular;             // Whether journal is circular
} Journal;

Journal* initialize_journal(const char *path, size_t size) {
    Journal *journal = malloc(sizeof(Journal));

    journal->fd = open(path, O_RDWR | O_CREAT, 0644);
    journal->size = size;
    journal->current_position = 0;
    journal->wrap_point = size;
    journal->circular = true;

    pthread_mutex_init(&journal->lock, NULL);

    // Initialize journal header
    JournalHeader header = {
        .magic = JOURNAL_MAGIC,
        .version = JOURNAL_VERSION,
        .creation_time = time(NULL)
    };

    write(journal->fd, &header, sizeof(header));
    fsync(journal->fd);

    return journal;
}

3. 事务管理

在日志文件系统中的事务是一系列被记录并原子地应用的操作。以下是事务管理的实现:

typedef struct {
    uint64_t txn_id;
    time_t start_time;
    List *operations;
    JournalEntry *start_entry;
    bool committed;
} Transaction;

typedef struct {
    Transaction **active_txns;
    size_t txn_count;
    pthread_mutex_t txn_lock;
    Journal *journal;
} TransactionManager;

Transaction* begin_transaction(TransactionManager *tm) {
    Transaction *txn = malloc(sizeof(Transaction));
    txn->txn_id = generate_txn_id();
    txn->start_time = time(NULL);
    txn->operations = list_create();
    txn->committed = false;

    // Create start entry
    JournalEntry *start_entry = create_journal_entry(
        TXN_START,
        txn->txn_id,
        NULL,
        0
    );

    pthread_mutex_lock(&tm->journal->lock);
    write_journal_entry(tm->journal, start_entry);
    pthread_mutex_unlock(&tm->journal->lock);

    txn->start_entry = start_entry;

    // Add to active transactions
    pthread_mutex_lock(&tm->txn_lock);
    tm->active_txns[tm->txn_count++] = txn;
    pthread_mutex_unlock(&tm->txn_lock);

    return txn;
}

bool commit_transaction(TransactionManager *tm, Transaction *txn) {
    // Write all pending operations
    List *ops = txn->operations;
    ListIterator it = list_iterator_create(ops);

    pthread_mutex_lock(&tm->journal->lock);

    while (list_iterator_has_next(it)) {
        JournalEntry *entry = list_iterator_next(it);
        if (!write_journal_entry(tm->journal, entry)) {
            pthread_mutex_unlock(&tm->journal->lock);
            return false;
        }
    }

    // Write commit record
    JournalEntry *commit_entry = create_journal_entry(
        TXN_COMMIT,
        txn->txn_id,
        NULL,
        0
    );

    bool success = write_journal_entry(tm->journal, commit_entry);
    pthread_mutex_unlock(&tm->journal->lock);

    if (success) {
        txn->committed = true;
        remove_active_transaction(tm, txn);
    }

    return success;
}

4. 恢复机制

恢复机制确保文件系统在崩溃后可以恢复到一致状态。以下是崩溃恢复的实现:

typedef struct {
    Journal *journal;
    List *incomplete_txns;
    List *completed_txns;
} RecoveryManager;

bool perform_recovery(RecoveryManager *rm) {
    // Scan journal from last checkpoint
    uint64_t checkpoint_position = find_last_checkpoint(rm->journal);
    set_journal_position(rm->journal, checkpoint_position);

    while (has_more_entries(rm->journal)) {
        JournalEntry *entry = read_next_entry(rm->journal);

        switch (entry->type) {
            case TXN_START:
                add_incomplete_transaction(rm, entry->txn_id);
                break;

            case TXN_COMMIT:
                move_to_completed(rm, entry->txn_id);
                break;

            case TXN_ABORT:
                remove_incomplete_transaction(rm, entry->txn_id);
                break;

            case DATA_WRITE:
            case METADATA_UPDATE:
                if (is_transaction_complete(rm, entry->txn_id)) {
                    replay_operation(entry);
                }
                break;
        }
    }

    // Rollback incomplete transactions
    return rollback_incomplete_transactions(rm);
}

bool replay_operation(JournalEntry *entry) {
    switch (entry->type) {
        case DATA_WRITE:
            return replay_data_write(entry);

        case METADATA_UPDATE:
            return replay_metadata_update(entry);

        default:
            return false;
    }
}

5.检查点处理

检查点用于减少恢复过程中需要扫描的日志数据量。以下是检查点处理的一个实现:

typedef struct {
    uint64_t checkpoint_id;
    time_t timestamp;
    List *active_txns;
    uint64_t journal_position;
} Checkpoint;

bool create_checkpoint(Journal *journal, TransactionManager *tm) {
    pthread_mutex_lock(&tm->txn_lock);
    pthread_mutex_lock(&journal->lock);

    Checkpoint *cp = malloc(sizeof(Checkpoint));
    cp->checkpoint_id = generate_checkpoint_id();
    cp->timestamp = time(NULL);
    cp->journal_position = journal->current_position;

    // Copy active transactions
    cp->active_txns = list_create();
    for (size_t i = 0; i < tm->txn_count; i++) {
        list_append(cp->active_txns, tm->active_txns[i]);
    }

    // Write checkpoint record
    JournalEntry *cp_entry = create_checkpoint_entry(cp);
    bool success = write_journal_entry(journal, cp_entry);

    if (success) {
        fsync(journal->fd);
        update_checkpoint_pointer(journal, cp);
    }

    pthread_mutex_unlock(&journal->lock);
    pthread_mutex_unlock(&tm->txn_lock);

    return success;
}

6.预写日志

预写日志(WAL)确保在将更改应用到文件系统之前进行日志记录。以下是WAL的实现:

typedef struct {
    void *data;
    size_t length;
    uint64_t disk_offset;
    uint32_t checksum;
} WALRecord;

bool write_ahead_log(Journal *journal, WALRecord *record) {
    // Create log entry
    JournalEntry *entry = create_journal_entry(
        DATA_WRITE,
        current_transaction_id(),
        record->data,
        record->length
    );

    // Write to journal first
    pthread_mutex_lock(&journal->lock);
    bool logged = write_journal_entry(journal, entry);
    pthread_mutex_unlock(&journal->lock);

    if (!logged) {
        return false;
    }

    // Now safe to write to actual location
    return write_to_disk(record->disk_offset,
                        record->data,
                        record->length);
}

7.日志空间管理

日志空间管理确保日志不会耗尽空间。以下是日志空间管理的一个实现:

typedef struct {
    uint64_t total_space;
    uint64_t used_space;
    uint64_t reserved_space;
    List *free_segments;
} JournalSpaceManager;

bool allocate_journal_space(JournalSpaceManager *manager,
                          size_t required_space) {
    pthread_mutex_lock(&manager->lock);

    if (manager->used_space + required_space >
        manager->total_space - manager->reserved_space) {
        // Try to reclaim space
        if (!reclaim_journal_space(manager, required_space)) {
            pthread_mutex_unlock(&manager->lock);
            return false;
        }
    }

    manager->used_space += required_space;
    pthread_mutex_unlock(&manager->lock);
    return true;
}

bool reclaim_journal_space(JournalSpaceManager *manager,
                         size_t required_space) {
    // Find completed transactions that can be truncated
    List *completed_txns = find_completed_transactions(manager);
    uint64_t reclaimable_space = 0;

    ListIterator it = list_iterator_create(completed_txns);
    while (list_iterator_has_next(it)) {
        Transaction *txn = list_iterator_next(it);
        reclaimable_space += calculate_transaction_space(txn);

        if (reclaimable_space >= required_space) {
            truncate_journal(manager, txn->start_entry->position);
            return true;
        }
    }

    return false;
}

8.总结

文件系统日志记录对于保持数据一致性和从系统崩溃中恢复至关重要。了解日志记录机制的实现细节对于开发可靠的文件系统至关重要。