From: David Howells The attached patch adds a general filesystem cache. This takes the form of a filesystem so that it can store the cache on a block device directly rather than going through another disc filesystem. The reasons for this include greater performance and ease of maintenance of metadata and data consistency. See the documentation in patch 3/6 for a more thorough explanation. Signed-Off-By: David Howells Signed-off-by: Steve Dickson Signed-off-by: Andrew Morton --- fs/Kconfig | 30 fs/Makefile | 1 fs/cachefs/Makefile | 26 fs/cachefs/block.c | 708 +++++++++++++++++ fs/cachefs/cachefs-int.h | 687 +++++++++++++++++ fs/cachefs/cachefs-layout.h | 503 ++++++++++++ fs/cachefs/index.c | 970 ++++++++++++++++++++++++ fs/cachefs/indirection-io.c | 833 ++++++++++++++++++++ fs/cachefs/inode.c | 400 ++++++++++ fs/cachefs/interface.c | 1473 ++++++++++++++++++++++++++++++++++++ fs/cachefs/journal.c | 1748 +++++++++++++++++++++++++++++++++++++++++++ fs/cachefs/kcachefsd.c | 164 ++++ fs/cachefs/linear-io.c | 222 +++++ fs/cachefs/main.c | 142 +++ fs/cachefs/misc.c | 296 +++++++ fs/cachefs/nowrite.c | 133 +++ fs/cachefs/recycling.c | 1090 +++++++++++++++++++++++++++ fs/cachefs/replay.c | 1753 ++++++++++++++++++++++++++++++++++++++++++++ fs/cachefs/rootdir.c | 778 +++++++++++++++++++ fs/cachefs/status.c | 217 +++++ fs/cachefs/super.c | 938 +++++++++++++++++++++++ fs/cachefs/vjournal.c | 656 ++++++++++++++++ include/linux/cachefs.h | 351 ++++++++ 23 files changed, 14119 insertions(+) diff -puN /dev/null fs/cachefs/block.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/block.c 2005-06-26 13:42:55.000000000 -0700 @@ -0,0 +1,708 @@ +/* block.c: metadata block management + * + * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +kmem_cache_t *cachefs_block_jar; + +void cachefs_block_init_once(void *_block, kmem_cache_t *cachep, + unsigned long flags) +{ + struct cachefs_block *block = _block; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + memset(block, 0, sizeof(*block)); + + rwlock_init(&block->ref_lock); + init_waitqueue_head(&block->writewq); + INIT_LIST_HEAD(&block->batch_link); + } +} + +/*****************************************************************************/ +/* + * initialise the block with zeros + */ +static int cachefs_block_dummy_filler(void *data, struct page *page) +{ + struct cachefs_page *pageio; + + _enter("%p,{%lu}", data, page->index); + + /* we need somewhere to note journal ACKs that need to be made */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) + return PTR_ERR(pageio); + + pageio->mapped_block = data; + cachefs_block_get(pageio->mapped_block); + + memclear_highpage_flush(page, 0, PAGE_SIZE); + + SetPageUptodate(page); + unlock_page(page); + return 0; + +} /* end cachefs_block_dummy_filler() */ + +/*****************************************************************************/ +/* + * associate a page with a block, dislodging any old page association + */ +int cachefs_block_set(struct cachefs_super *super, + struct cachefs_block *block, + struct page *page, + struct cachefs_page *pageio) +{ + DECLARE_WAITQUEUE(myself,current); + + struct cachefs_block *block2; + + _enter(",%u,", block->bix); + + /* don't do anything if already associated as we want */ + block2 = pageio->mapped_block; + 
if (block2) { + if (block2 == block) { + if (block->page == page) { + _leave(" = 0 [assoc preset]"); + return 0; + } + + block->page = page; + _leave(" = 0 [assoc xchg]"); + return 0; + } + + BUG(); /* page already associated with a different block! */ + } + + /* get the page alloc lock for this block */ + if (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + + if (signal_pending(current)) + goto intr; + } + + /* make the association */ + pageio->mapped_block = cachefs_block_get(block); + + clear_bit(CACHEFS_BLOCK_COW,&block->flags); + block->page = page; + + clear_bit(CACHEFS_BLOCK_ALLOC,&block->flags); + wake_up_all(&block->writewq); + + _leave(" = 0 [assoc set]"); + return 0; + + intr: + _leave(" = -EINTR"); + return -EINTR; + +} /* end cachefs_block_set() */ + +/*****************************************************************************/ +/* + * associate a page with a block, dislodging any old page association + */ +int cachefs_block_set2(struct cachefs_super *super, + cachefs_blockix_t bix, + struct page *page, + struct cachefs_page *pageio, + struct cachefs_block **_block) +{ + struct cachefs_block *block; + int ret; + + _enter(",%u,,",bix); + + if (_block) + *_block = NULL; + + /* get the block definition */ + block = cachefs_block_insert(super, bix); + if (IS_ERR(block)) { + ret = PTR_ERR(block); + goto error; + } + + /* associate the block with the page */ + ret = cachefs_block_set(super, block, page, pageio); + if (ret < 0) + goto error2; + + /* we return the block to the caller with an extra ref held if + * they ask for it */ + if (_block) { + *_block = block; + goto error; + } + + error2: + 
cachefs_block_put(block); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_block_set2() */ + +/*****************************************************************************/ +/* + * read a metadata block from disc or initialise it + */ +int cachefs_block_read(struct cachefs_super *super, + struct cachefs_inode *inode, + cachefs_blockix_t bix, + int wipe, + struct cachefs_block **_block, + struct page **_page) +{ + struct address_space *mapping; + struct cachefs_block *block; + struct page *page; + filler_t *filler; + + DECLARE_WAITQUEUE(myself, current); + + _enter(",%lx,%u,%d,,", + inode ? inode->vfs_inode.i_ino : CACHEFS_INO_MISC, bix, wipe); + + if (_block) + *_block = NULL; + if (_page) + *_page = NULL; + + /* get the block definition */ + block = cachefs_block_insert(super, bix); + if (IS_ERR(block)) { + _leave(" = %ld [bi]", PTR_ERR(block)); + return PTR_ERR(block); + } + + /* get the page alloc lock for this block */ + if (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + + if (signal_pending(current)) + goto intr; + } + + /* get a page for it if it doesn't already exist */ + if (!block->page) { + /* if the block is marked as currently undergoing writeback + * then there must have been an ENOMEM encountered whilst + * trying to COW the block */ + if (test_bit(CACHEFS_BLOCK_WRITEBACK, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_bit(CACHEFS_BLOCK_WRITEBACK, + &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + 
remove_wait_queue(&block->writewq, &myself); + + if (signal_pending(current)) + goto intr2; + } + + /* load the page into the page cache */ + if (inode) + mapping = inode->vfs_inode.i_mapping; + else + mapping = super->imisc->i_mapping; + + filler = (filler_t *) mapping->a_ops->readpage; + if (wipe) + filler = cachefs_block_dummy_filler; + + page = read_cache_page(mapping, bix, filler, block); + + if (IS_ERR(page)) { + cachefs_block_put(block); + _leave(" = %ld [rcp]", PTR_ERR(page)); + return PTR_ERR(page); + } + + block->page = page; + } + else { + page = block->page; + get_page(page); + } + + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + + if (_block) { + *_block = block; + } + else { + cachefs_block_put(block); + block = NULL; + } + + if (_page) { + *_page = page; + } + else { + dbgpgfree(page); + page_cache_release(page); + } + + _leave(" = 0"); + return 0; + + intr2: + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + intr: + cachefs_block_put(block); + _leave(" = -EINTR"); + return -EINTR; + +} /* end cachefs_block_read() */ + +/*****************************************************************************/ +/* + * copy a block upon attempting to modify it and finding that it's busy being + * written out + */ +int cachefs_block_cow(struct cachefs_super *super, struct cachefs_block *block) +{ + DECLARE_WAITQUEUE(myself, current); + +#ifndef CACHEFS_BLOCK_USE_COW + + _enter(",{%u}", block->bix); + + /* if COW is not permitted, then simply wait for the page to finish + * being written back */ + if (test_bit(CACHEFS_BLOCK_COW, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_bit(CACHEFS_BLOCK_COW, &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + } + + _leave(" = 0"); + return 
0; + +#else + /* experimental page copy-on-write; may not work */ + struct address_space *mapping; + struct page *page, *newpage; + filler_t filler; + int ret; + + _enter(",%u", block->bix); + + /* get the page alloc lock for this block */ + if (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + } + + /* duplicate the page if it's flagged copy-on-write */ + if (test_bit(CACHEFS_BLOCK_COW, &block->flags)) { + struct cachefs_page *newpageio; + + mapping = super->imisc->i_mapping; + + ret = -ENOMEM; + newpage = page_cache_alloc_cold(mapping); + if (!newpage) + goto error; + + if (cachefs_page_get_private(newpage, &newpageio, + mapping_gfp_mask(mapping)) < 0) + goto error_page; + + newpageio->mapped_block = + cachefs_block_get( + __cachefs_get_page_block(block->page)); + + copy_highpage(newpage, block->page); + + /* exchange the old page for the new page */ + page = xchg(&block->page, NULL); + + mapping->a_ops->releasepage(page, GFP_NOFS); + remove_from_page_cache(page); + page_cache_release(page); + page = NULL; + + ret = add_to_page_cache_lru(newpage, mapping, block->bix, + mapping_gfp_mask(mapping)); + if (ret < 0) { + BUG_ON(ret == -EEXIST); + goto error_page; + } + + block->page = newpage; + } + else { + page = block->page; + get_page(page); + } + + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + + _leave(" = 0"); + return 0; + + error_page: + page_cache_release(newpage); + error: + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + + _leave(" = %d", ret); + return ret; +#endif + +} /* end cachefs_block_cow() */ + +/*****************************************************************************/ +/* + * 
indicate that we're going to modify a block + * - the page pointed to by *_page may be COW'd and replaced with a different + * page + */ +void cachefs_block_modify(struct cachefs_super *super, + struct cachefs_block *block, + struct page **_page) +{ + struct page *page; + + _enter(",%u,", block->bix); + + if (*_page != block->page) { + page = block->page; + get_page(page); + cachefs_put_page(xchg(_page, page)); + } + + BUG_ON(!*_page); + + _leave(""); + +} /* end cachefs_block_modify() */ + +/*****************************************************************************/ +/* + * insert a block into the superblock's lookup tree (if it doesn't already + * exist) + */ +struct cachefs_block *cachefs_block_insert(struct cachefs_super *super, + cachefs_blockix_t bix) +{ + struct cachefs_block *newblock, *block; + struct rb_node *parent, **p; + unsigned long flags; + + _enter(",%u", bix); + + if (bix > i_size_read(super->sb->s_bdev->bd_inode) / PAGE_SIZE) { + printk("CacheFS: trying to insert out of range block %x/%lx\n", + bix, + (unsigned long) + (i_size_read(super->sb->s_bdev->bd_inode) >> PAGE_SHIFT) + ); + BUG(); + } + + /* allocate and initialise a block record just in case */ + newblock = kmem_cache_alloc(cachefs_block_jar, SLAB_KERNEL); + if (!newblock) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + atomic_set(&newblock->usage,1); + newblock->flags = 0; + newblock->bix = bix; + newblock->super = super; + newblock->page = NULL; + newblock->writeback = NULL; + newblock->ref = NULL; + + parent = NULL; + block = NULL; + + /* see if the block is already recorded */ + write_lock_irqsave(&super->blk_tree_lock, flags); + p = &super->blk_tree.rb_node; + + while (*p) { + parent = *p; + block = rb_entry(parent, struct cachefs_block, lookup_node); + + if (bix < block->bix) + p = &(*p)->rb_left; + else if (bix > block->bix) + p = &(*p)->rb_right; + else + goto block_already_present; + } + + /* there's no block record yet - use the new one we allocated + * earlier 
*/ + rb_link_node(&newblock->lookup_node, parent, p); + rb_insert_color(&newblock->lookup_node, &super->blk_tree); + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + atomic_inc(&super->cnt_blk_tree); + _leave(" = %p {u=%d} [new]", newblock, atomic_read(&newblock->usage)); + return newblock; + + /* the block is already recorded, pin that one and dispose of + * the new one */ + block_already_present: + cachefs_block_get(block); + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + dbgfree(newblock); + kmem_cache_free(cachefs_block_jar, newblock); + + _leave(" = %p {u=%d}", block, atomic_read(&block->usage)); + return block; + +} /* end cachefs_block_insert() */ + +/*****************************************************************************/ +/* + * find a block in the superblock's lookup tree + */ +struct cachefs_block *cachefs_block_find(struct cachefs_super *super, + cachefs_blockix_t bix) +{ + struct cachefs_block *block; + struct rb_node *node; + unsigned long flags; + + _enter(",%d", bix); + + /* do the lookup */ + read_lock_irqsave(&super->blk_tree_lock, flags); + node = super->blk_tree.rb_node; + + while (node) { + block = rb_entry(node, struct cachefs_block, lookup_node); + + if (bix < block->bix) + node = node->rb_left; + else if (bix > block->bix) + node = node->rb_right; + else + goto block_found; + } + read_unlock_irqrestore(&super->blk_tree_lock, flags); + + /* not found */ + _leave(" = -ENOENT"); + return ERR_PTR(-ENOENT); + + /* found - pin and return */ +block_found: + cachefs_block_get(block); + read_unlock_irqrestore(&super->blk_tree_lock, flags); + + _leave(" = %p{u=%d}", block, atomic_read(&block->usage)); + return block; + +} /* end cachefs_block_find() */ + +/*****************************************************************************/ +/* + * dispose of a block record + */ +void __cachefs_block_put(struct cachefs_block *block) +{ + struct cachefs_super *super = block->super; + unsigned long flags; + + _enter(",{u=%d 
bix=%d}", atomic_read(&block->usage), block->bix); + + /* see if we can remove from the superblock's lookup tree */ + write_lock_irqsave(&super->blk_tree_lock, flags); + + if (atomic_read(&block->usage) == 0) + rb_erase(&block->lookup_node, &super->blk_tree); + else + block = NULL; + + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + /* destroy if now completely unused */ + if (block) { + atomic_dec(&super->cnt_blk_tree); + dbgfree(block); + kmem_cache_free(cachefs_block_jar, block); + } + + _leave(""); + +} /* end __cachefs_block_put() */ + +/*****************************************************************************/ +/* + * withdraw from active service all the blocks residing on a device + * - every block in the lookup tree is first flagged as withdrawn + * - each block still referenced by a netfs page is then detached from that + * page, and we wait for the netfs to stop accessing it before cancelling + * any validity journal entry + */ +void cachefs_block_withdraw(struct cachefs_super *super) +{ + struct cachefs_block *block, *xblock; + struct cachefs_page *pageio; + struct rb_node *node; + unsigned long flags; + + DECLARE_WAITQUEUE(myself, current); + + _enter(""); + + /* first thing to do is mark all blocks withdrawn + * - this prevents the netfs from getting underfoot + */ + read_lock_irqsave(&super->blk_tree_lock, flags); + + for (node = rb_first(&super->blk_tree); node; node = rb_next(node)) { + block = rb_entry(node, struct cachefs_block, lookup_node); + set_bit(CACHEFS_BLOCK_WITHDRAWN, &block->flags); + } + + read_unlock_irqrestore(&super->blk_tree_lock, flags); + + /* now withdraw each block that's already in use by a netfs */ + for (;;) { + block = NULL; + + /* find the next one in the tree */ + write_lock_irqsave(&super->blk_tree_lock, flags); + + for (node = rb_first(&super->blk_tree); + node; + node = rb_next(node)) { + block = rb_entry(node, struct cachefs_block, + lookup_node); + if (block->ref) { + cachefs_block_get(block); + break; + } + } + + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + if (!node) + break; + + _debug("withdraw block %u", block->bix); + + /* disconnect the block from the occupying netfs's + * page mapping cookie */ + xblock = NULL; + 
write_lock(&block->ref_lock); + + /* block->ref may have been cleared since it was sampled under + * blk_tree_lock, so pageio is only touched if it is still set; + * ref_lock must be dropped on both paths */ + pageio = block->ref; + if (pageio) { + BUG_ON(pageio->mapped_block != block); + + write_lock(&pageio->lock); + xblock = pageio->mapped_block; + pageio->mapped_block = NULL; + block->ref = NULL; + write_unlock(&pageio->lock); + } + + write_unlock(&block->ref_lock); + cachefs_block_put(xblock); + + /* wait for the netfs to finish with the block */ + if (test_bit(CACHEFS_BLOCK_NETFSBUSY, &block->flags)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_bit(CACHEFS_BLOCK_NETFSBUSY, + &block->flags)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + } + + /* a block that's not yet achieved validity must be + * cancelled to avoid bad data later */ + cachefs_vj_cancel(block); + + cachefs_block_put(block); + } + + _leave(""); + +} /* end cachefs_block_withdraw() */ diff -puN /dev/null fs/cachefs/cachefs-int.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/cachefs-int.h 2005-06-26 13:42:55.000000000 -0700 @@ -0,0 +1,687 @@ +/* cachefs-int.h: general filesystem caching internal defs + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _LINUX_CACHEFS_INT_H +#define _LINUX_CACHEFS_INT_H + +#include +#include +#include +#include "cachefs-layout.h" + +/* set to true to use COW buffering during batched writes rather than simply suspending any process + * that wants to modify a metadata page undergoing writeback */ +#undef CACHEFS_BLOCK_USE_COW + +#define CACHEFS_BATCH_WRITE_TIMER 5 /* time in seconds to next batch write */ + +extern int cachefs_debug; + +struct cachefs_super; +struct cachefs_block; +struct cachefs_inode; +struct cachefs_search_result; +struct cachefs_transaction; + +extern struct address_space_operations cachefs_indr_io_addrspace_operations; +extern struct address_space_operations cachefs_linear_io_addrspace_operations; +extern struct file_operations cachefs_root_file_operations; +extern struct inode_operations cachefs_root_inode_operations; +extern struct rw_semaphore cachefs_addremove_sem; +extern struct list_head cachefs_cache_list; +extern struct list_head cachefs_netfs_list; + +extern int cachefs_fs_init(void); +extern void cachefs_fs_exit(void); +extern int kcachefsd(void *_super); + +extern int cachefs_io_dummy_filler(void *data, struct page *page); + +extern int cachefs_indr_io_get_block(struct inode *inode, struct page *page, + struct cachefs_page *pageio, int create); + +struct cachefs_reclaimable { + unsigned ino; + time_t atime; +}; + +/*****************************************************************************/ +/* + * cachefs superblock private information + */ +struct cachefs_super +{ + struct super_block *sb; + struct list_head mnt_link; /* link in list of mounted caches */ + struct cachefs_inode *imetadata; /* the metadata records file */ + struct inode *imisc; /* an inode covering the whole blkdev */ + + unsigned long flags; +#define CACHEFS_SUPER_INIT_BLKDEV 0 /* T if initialising blockdev */ +#define CACHEFS_SUPER_BATCH_TIMER 1 /* T if batch timer expired */ +#define CACHEFS_SUPER_DO_RECLAIM 2 /* T if should do reclamation */ +#define 
CACHEFS_SUPER_RCM_IMM_SCAN 3 /* T if should scan for immediately + * reclaimable inodes */ +#define CACHEFS_SUPER_WITHDRAWN 4 /* T if cache has been withdrawn */ +#define CACHEFS_SUPER_REPLAYING_UJNL 5 /* T if replaying u-journal */ + + int bio_wr_barrier; /* command to submit a write barrier BIO */ + + /* index management */ + struct list_head ino_list; /* list of data/index inodes */ + spinlock_t ino_list_lock; + + /* block allocation and recycling management */ + struct rb_root blk_tree; /* block mapping tree */ + rwlock_t blk_tree_lock; + + cachefs_blockix_t alloc_cur; /* current free block alloc stack */ + unsigned alloc_cur_n; /* current occupancy of alloc stack */ + unsigned short alloc_leaf; /* next leaf to allocate */ + struct cachefs_block *alloc_block; /* current node in allocation stack */ + struct page *alloc_node; /* current node in allocation stack */ + struct cachefs_block *alloc_nxblock; /* next node in allocation tree */ + struct page *alloc_next; /* next node in allocation tree */ + struct semaphore alloc_sem; /* allocation semaphore */ + wait_queue_head_t alloc_wq; /* processes waiting for allocation */ + + struct cachefs_block *recycle_block; /* current node in recycle stack */ + struct page *recycle_node; /* current node being recycled to */ + unsigned recycle_room; /* room remaining in front recycle node */ + cachefs_blockix_t recycle_cur; /* current node in recycle stack */ + unsigned recycle_cur_n; /* current occupancy of reserve stack */ + + /* inode reclamation */ + spinlock_t rcm_lock; + + unsigned *rcm_imm_buf; /* circular immediate-reclaim buffer */ + unsigned short rcm_imm_head; + unsigned short rcm_imm_tail; + +#define CACHEFS_RCM_IMM_BUFSIZE (PAGE_SIZE/sizeof(unsigned)) + + struct cachefs_reclaimable *rcm_atm_list; /* atime-based reclaimable inode list */ + unsigned short rcm_atm_end; /* end of buffer contents */ + +#define CACHEFS_RCM_ATM_LISTSIZE (PAGE_SIZE/sizeof(struct cachefs_reclaimable)) + + unsigned rcm_ino; /* inode being 
reclaimed */ + unsigned rcm_indirect; /* current indirect block index */ + cachefs_blockix_t rcm_block; /* current block being recycled */ + unsigned short rcm_ptrnext; /* next entry in rcyblock to process */ + unsigned short rcm_ptrstop; /* entry in rcyblock to stop at */ + + struct cachefs_inode *rcm_inode; /* inode being reclaimed */ + struct page *rcm_curpage; /* page holding rcm_block */ + + /* update journal tracking */ + unsigned short ujnl_step; /* journal block size */ + unsigned short ujnl_head; /* next journal block to alloc */ + unsigned short ujnl_tail; /* next journal block to ACK */ + wait_queue_head_t ujnl_sync_wq; /* journal sync waitqueue */ + + struct semaphore ujnl_alloc_sem; + wait_queue_head_t ujnl_alloc_wq; + + unsigned ujnl_jsof; /* u-journal start sector */ + int16_t ujnl_batch; /* next batch to be written */ + uint16_t ujnl_serial; /* next serial to use in batch */ + spinlock_t ujnl_mk_lock; + struct list_head ujnl_markq; /* marked transactions */ + struct list_head ujnl_commitq; /* committed transactions */ + struct list_head ujnl_writeq; /* transactions being written */ + struct list_head ujnl_replayq; /* blocks having allocation replayed */ + + struct cachefs_alteration *njalt_markq; /* unjournalled alterations - marked */ + struct cachefs_alteration *njalt_writeq; /* unjournalled alterations - writing */ + spinlock_t njalt_lock; + + struct semaphore batch_sem; /* batching mutex */ + struct semaphore batch_uj_sem; /* ujnl written sync mutex */ + struct rw_semaphore batch_ctrl_sem; /* marking/batching interleave control */ + spinlock_t batch_qlock; + struct list_head batch_writeq; /* blocks awaiting writing */ + struct list_head batch_doneq; /* blocks written */ + struct list_head batch_errorq; /* blocks that got write error */ + wait_queue_head_t batch_done_wq; /* blocks write complete wait queue */ + struct timer_list batch_timer; /* time to next batch write */ + wait_queue_head_t batch_timer_wq; /* batch timer wait queue */ + 
wait_queue_head_t batch_sync_wq; /* batch sync wait queue */ + + /* validity journal tracking */ + unsigned long *vjnl_map; /* bitmap of free entries (1 page) */ + unsigned vjnl_count; /* number of free entries */ + spinlock_t vjnl_lock; /* allocation lock */ + wait_queue_head_t vjnl_alloc_wq; /* allocation queue */ + struct list_head vjnl_unallocq; /* entries requiring unallocation */ + struct list_head vjnl_writtenq; /* entries requiring clearing */ + + /* writeback journal tracking */ + unsigned long *wbj_map; /* bitmap of free entries (1 page) */ + unsigned wbj_count; /* number of free entries */ + spinlock_t wbj_lock; /* allocation lock */ + wait_queue_head_t wbj_alloc_wq; /* allocation queue */ + + /* cache management daemon for this fs */ + task_t *dmn_task; /* cache daemon task */ + struct completion dmn_alive; /* completion of initialisation */ + struct completion dmn_dead; /* completion of death */ + wait_queue_head_t dmn_sleepq; /* general sleep queue */ + int dmn_die; /* request to die */ + + /* event counting */ + atomic_t cnt_blk_tree; /* number of outstanding blk_tree nodes */ + atomic_t cnt_ujnl_mkrq; /* number of marks requested */ + atomic_t cnt_ujnl_mkgr; /* number of marks granted */ + atomic_t cnt_ujnl_mkwr; /* number of marks written */ + atomic_t cnt_ujnl_akrq; /* number of ACKs requested */ + atomic_t cnt_ujnl_akgr; /* number of ACKs granted */ + atomic_t cnt_ujnl_akwr; /* number of ACKs written */ + atomic_t cnt_ujnl_free; /* number of marks freed */ + + /* superblock copy */ + struct cachefs_ondisc_superblock *layout; +}; + +extern void cachefs_add_cache(struct cachefs_super *super, + struct cachefs_search_result *srch); +extern void cachefs_withdraw_cache(struct cachefs_super *super); + +extern void cachefs_recycle_unready_blocks(struct cachefs_super *super); +extern void cachefs_recycle_transfer_stack(struct cachefs_super *super); +extern void cachefs_recycle_reclaim(struct cachefs_super *super); +extern void 
cachefs_recycle_unallocate_data_block(struct cachefs_super *super); + +extern int cachefs_ujnl_check_barrier_cap(struct cachefs_super *super); + +/*****************************************************************************/ +/* + * block management record + */ +struct cachefs_block +{ + struct rb_node lookup_node; /* node in superblock's lookup tree */ + struct cachefs_super *super; /* superblock on which block resides */ + cachefs_blockix_t bix; /* index of block on disc */ + atomic_t usage; /* usage count */ + wait_queue_head_t writewq; /* write completion sleep queue */ + unsigned long flags; +#define CACHEFS_BLOCK_ALLOC 0 /* [bit] page allocation lock */ +#define CACHEFS_BLOCK_WRITEBACK 1 /* [bit] block undergoing writeback */ +#define CACHEFS_BLOCK_COW 2 /* [bit] page must be copied before modification */ +#define CACHEFS_BLOCK_NOCOW 3 /* [bit] page mustn't be COW'ed */ +#define CACHEFS_BLOCK_ERROR 4 /* [bit] block has disc error */ +#define CACHEFS_BLOCK_UJOURNAL 5 /* [bit] block holds update journal entries */ +#define CACHEFS_BLOCK_CRITICAL 6 /* [bit] block holds critical data that mustn't be + * zapped until u-journal sync'd */ +#define CACHEFS_BLOCK_WITHDRAWN 7 /* [bit] backing cache withdrawn from service */ +#define CACHEFS_BLOCK_NETFSDATA 8 /* [bit] netfs data block (discard metadata) */ +#define CACHEFS_BLOCK_NETFSBUSY 9 /* [bit] netfs is accessing the block */ +#define CACHEFS_BLOCK_ALTERED 10 /* [bit] unjournalled alteration made */ + +#define _CACHEFS_BLOCK_ALLOC (1 << CACHEFS_BLOCK_ALLOC) +#define _CACHEFS_BLOCK_COW (1 << CACHEFS_BLOCK_COW) +#define _CACHEFS_BLOCK_WRITEBACK (1 << CACHEFS_BLOCK_WRITEBACK) +#define _CACHEFS_BLOCK_UJOURNAL (1 << CACHEFS_BLOCK_UJOURNAL) + + struct list_head batch_link; /* link in batch writer's list */ + struct page *page; /* current data for this block */ + struct page *writeback; /* source of writeback for this block */ + struct cachefs_page *ref; /* netfs's ref to this page */ + rwlock_t ref_lock; /* lock 
governing ref pointer */ + struct cachefs_vj_entry *vjentry; /* invalid block record */ +}; + +extern kmem_cache_t *cachefs_block_jar; + +extern void cachefs_block_init_once(void *_block, kmem_cache_t *cachep, + unsigned long flags); + +extern struct cachefs_block *cachefs_block_insert(struct cachefs_super *super, + cachefs_blockix_t bix); + +extern struct cachefs_block * cachefs_block_find(struct cachefs_super *super, + cachefs_blockix_t bix); + +extern int cachefs_block_set(struct cachefs_super *super, + struct cachefs_block *block, + struct page *page, + struct cachefs_page *pageio); + +extern int cachefs_block_set2(struct cachefs_super *super, + cachefs_blockix_t bix, + struct page *page, + struct cachefs_page *pageio, + struct cachefs_block **_block); + +extern int cachefs_block_read(struct cachefs_super *super, + struct cachefs_inode *inode, + cachefs_blockix_t bix, + int wipe, + struct cachefs_block **_block, + struct page **_page); + +extern void cachefs_block_modify(struct cachefs_super *super, + struct cachefs_block *block, + struct page **_page); + +extern int cachefs_block_cow(struct cachefs_super *super, + struct cachefs_block *block); + +extern int cachefs_block_begin_alter(struct cachefs_block *block); +extern void cachefs_block_end_alter(struct cachefs_block *block); + +static inline +struct cachefs_block *cachefs_block_get(struct cachefs_block *block) +{ + atomic_inc(&block->usage); + return block; +} + +extern void __cachefs_block_put(struct cachefs_block *block); + +static inline void cachefs_block_put(struct cachefs_block *block) +{ + if (block) { + int usage = atomic_read(&block->usage); + + if ((usage & 0xffffff00) == 0x6b6b6b00) { + printk("\ncachefs_block_put(%p{u=%d})\n", + block, usage); + BUG(); + } + + BUG_ON(usage <= 0); + if (atomic_dec_and_test(&block->usage)) + __cachefs_block_put(block); + } +} + +static inline struct cachefs_block *__cachefs_get_page_block(struct page *page) +{ + BUG_ON(!PagePrivate(page)); + return ((struct 
cachefs_page *) page->private)->mapped_block; +} + +static inline void cachefs_page_modify(struct cachefs_super *super, + struct page **page) +{ + cachefs_block_modify(super, __cachefs_get_page_block(*page), page); +} + +extern void cachefs_block_withdraw(struct cachefs_super *super); + +/*****************************************************************************/ +/* + * data file or index object cookie + * - a file will only appear in one cache + * - a request to cache a file may or may not be honoured, subject to + * constraints such as disc space + * - indexes files are created on disc just-in-time + */ +struct cachefs_cookie +{ + atomic_t usage; /* number of users of this cookie */ + atomic_t children; /* number of children of this cookie */ + struct cachefs_index_def *idef; /* index definition */ + struct cachefs_cookie *iparent; /* index holding this entry */ + struct list_head search_results; /* results of searching iparent */ + struct list_head backing_inodes; /* inode(s) backing this file/index */ + struct rw_semaphore sem; + struct cachefs_netfs *netfs; /* owner network fs definition */ + void *netfs_data; /* back pointer to netfs */ +}; + +struct cachefs_search_result { + struct list_head link; /* link in search_results */ + struct cachefs_super *super; /* superblock searched */ + unsigned ino; /* inode number (or 0 if negative) */ +}; + +extern kmem_cache_t *cachefs_cookie_jar; + +extern void cachefs_cookie_init_once(void *_cookie, kmem_cache_t *cachep, unsigned long flags); + +/*****************************************************************************/ +/* + * on-disc per-cache inode record + */ +struct cachefs_inode +{ + struct inode vfs_inode; /* VFS inode record for this file */ + + struct cachefs_block *metadata; /* block containing metadata */ + struct page *metadata_page; /* page mapped to metadata block */ + struct rw_semaphore metadata_sem; /* metadata page access semaphore */ + unsigned short metadata_offset; /* metadata record offset */ 
+ + unsigned short index_dsize; /* size of data in each index entry */ + unsigned short index_esize; /* size of index entries */ + unsigned short index_epp; /* number of index entries per page */ + + unsigned long flags; +#define CACHEFS_ACTIVE_INODE_ISINDEX 0 /* T if inode is index file (F if file) */ +#define CACHEFS_ACTIVE_INODE_RELEASING 1 /* T if inode is being released */ +#define CACHEFS_ACTIVE_INODE_RECYCLING 2 /* T if inode is being retired */ +#define CACHEFS_ACTIVE_INODE_WITHDRAWN 3 /* T if inode has been withdrawn */ + + struct list_head super_link; /* link in super->ino_list */ + struct list_head cookie_link; /* link in cookie->backing_inodes */ + struct cachefs_cookie *cookie; /* netfs's file/index object */ +}; + +extern struct inode_operations cachefs_status_inode_operations; +extern struct file_operations cachefs_status_file_operations; + +#define CACHEFS_FS_I(inode) \ + container_of((inode), struct cachefs_inode, vfs_inode) + +extern struct cachefs_inode *cachefs_iget(struct cachefs_super *super, + ino_t ino); +extern int cachefs_write_inode(struct inode *_inode, int sync); +extern void cachefs_clear_inode(struct inode *vfs_inode); + +static inline struct cachefs_inode *cachefs_igrab(struct cachefs_inode *iinode) +{ + struct inode *inode = igrab(&iinode->vfs_inode); + return inode ? 
CACHEFS_FS_I(inode) : NULL; +} + +static inline void cachefs_iput(struct cachefs_inode *inode) +{ + if (inode) + iput(&inode->vfs_inode); +} + +extern struct page *cachefs_get_page(struct cachefs_inode *inode, + unsigned index); + +static inline void cachefs_put_page(struct page *page) +{ + if (page) + page_cache_release(page); +} + +extern int cachefs_sync_page(struct page *page); +extern int cachefs_invalidatepage(struct page *page, unsigned long offset); +extern int cachefs_releasepage(struct page *page, int gfp_flags); +extern int cachefs_no_writepage(struct page *page, + struct writeback_control *wbc); +extern int cachefs_no_writepages(struct address_space *mapping, + struct writeback_control *wbc); +extern int cachefs_no_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to); +extern int cachefs_no_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +extern int cachefs_no_set_page_dirty(struct page *page); + +extern int cachefs_io_pages_read(struct bio *bio, unsigned int bytes_done, + int err); + +extern int cachefs_io_alloc(struct super_block *sb, + sector_t first_sector, int nr_vecs, int gfp_flags, + struct bio **_bio); + +static inline +struct cachefs_ondisc_metadata *cachefs_metadata_preread(struct cachefs_inode *inode) +{ + down_read(&inode->metadata_sem); + return kmap_atomic(inode->metadata_page, KM_USER0) + + inode->metadata_offset; +} + +static inline +void cachefs_metadata_postread(struct cachefs_inode *inode, + struct cachefs_ondisc_metadata *metadata) +{ + kunmap_atomic(metadata, KM_USER0); + up_read(&inode->metadata_sem); +} + +static inline +struct cachefs_ondisc_metadata *cachefs_metadata_prewrite(struct cachefs_inode *inode) +{ + down_write(&inode->metadata_sem); + cachefs_block_modify(inode->metadata->super, inode->metadata, + &inode->metadata_page); + return kmap_atomic(inode->metadata_page, KM_USER0) + + inode->metadata_offset; +} + +static inline +void 
cachefs_metadata_postwrite(struct cachefs_inode *inode, + struct cachefs_ondisc_metadata *metadata) +{ + kunmap_atomic(metadata, KM_USER0); + up_write(&inode->metadata_sem); +} + +extern void cachefs_withdraw_inode(struct cachefs_inode *inode); + +extern int cachefs_index_search(struct cachefs_inode *index, + struct cachefs_cookie *target, + unsigned *_entry, + unsigned *_ino); + +extern int cachefs_index_add(struct cachefs_inode *index, + struct cachefs_cookie *cookie, + unsigned *_newino); + +extern int cachefs_index_update(struct cachefs_inode *index); + +extern int cachefs_index_reclaim_one_entry(struct cachefs_super *super, + struct cachefs_transaction **_trans); + +/*****************************************************************************/ +/* + * record of as-yet invalid data block for which a v-journal entry exists + */ +struct cachefs_vj_entry +{ + struct list_head link; + cachefs_blockix_t bix; + unsigned ino; /* inode to which applies */ + unsigned pgnum; /* page in inode */ + unsigned vslot; /* v-journal slot in which mark stored */ + struct page *vpage; /* page holding vblock */ + struct cachefs_block *vblock; /* v-journal block in which mark stored */ + unsigned ventry; /* offset in vblock at which mark stored */ + unsigned upblock; /* block in which pointer stored */ + unsigned upentry; /* offset in upblock at which pointer stored */ + int written; /* set when written */ +}; + +extern int cachefs_vj_alloc(struct cachefs_transaction *trans, + struct cachefs_inode *inode); +extern void cachefs_vj_release(struct cachefs_super *super, + struct cachefs_vj_entry *vjentry); +extern void cachefs_vj_cancel(struct cachefs_block *block); +extern void cachefs_vj_write_complete(struct cachefs_block *block); +extern void cachefs_vj_note_write_completion(struct cachefs_super *super); +extern int cachefs_vj_replay(struct cachefs_super *super); + + +/*****************************************************************************/ +/* + * transaction record and 
tracking structures + * - these record the modification of metadata (and not, generally, ordinary data) + */ +enum cachefs_trans_phase { + CACHEFS_TRANS_PREPARING, /* mark is being prepared */ + CACHEFS_TRANS_MARKED, /* mark has been made */ + CACHEFS_TRANS_COMMITTING, /* mark has been committed and is being written */ + CACHEFS_TRANS_DEAD /* mark is complete */ +} __attribute__((packed)); + +struct cachefs_trans_effect +{ + struct cachefs_block *block; + struct page *held_page; /* page on hold till writeback complete */ +}; + +#define CACHEFS_EFFECTS_PER_TRANS 4 + +struct cachefs_transaction +{ + int16_t batch; /* batch this mark belongs to */ + uint16_t serial; /* serial number within batch */ + enum cachefs_trans_phase phase; /* current phase of ACK */ + unsigned short index; /* index in u-journal of mark sector */ + + struct cachefs_ondisc_update_journal *jentry; /* update journal entry buffer + * - alloc'd when transaction allocated + * - freed when transaction committed */ + + struct cachefs_block *jblock; /* block holding ondisc u-journal entry */ + struct page *jpage; /* page holding u-journal entry */ + struct cachefs_vj_entry *vjentry; /* associated v-journal entry */ + struct cachefs_super *super; + struct list_head sblink; /* next transaction in superblock's list */ + + atomic_t usage; + + /* keep track of special changes that must only take effect under + * certain circumstances */ + uint16_t changed; +#define CACHEFS_TRANS_CHANGED_ALLOC 0x0001 /* alloc stack/leaf changed */ +#define CACHEFS_TRANS_CHANGED_RECYCLE 0x0002 /* recycle stack changed */ +#define CACHEFS_TRANS_CHANGED_RCMBLOCK 0x0004 /* inode/block being reclaimed changed */ +#define CACHEFS_TRANS_CHANGED_RCMPTR 0x0008 /* pointer being reclaimed changed */ + + /* tracking for blocks being modified by this transaction */ + unsigned eff_active; + struct cachefs_trans_effect effects[CACHEFS_EFFECTS_PER_TRANS]; +}; + +/* record of unjournalled alteration */ +struct cachefs_alteration +{ + struct 
cachefs_alteration *next; + struct cachefs_trans_effect effect; +}; + +extern +struct cachefs_transaction *cachefs_trans_alloc(struct cachefs_super *super, + unsigned long gfp); + +extern +struct cachefs_transaction * +cachefs_trans_alloc_replay(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry); + +extern void __cachefs_trans_put(struct cachefs_transaction *trans); +static inline void cachefs_trans_put(struct cachefs_transaction *trans) +{ + if (trans) + __cachefs_trans_put(trans); +} + +extern void cachefs_trans_affects_block(struct cachefs_transaction *trans, + struct cachefs_block *target, + unsigned offset, + unsigned size); + +static inline +void cachefs_trans_affects_page(struct cachefs_transaction *trans, + struct cachefs_page *pageio, + unsigned offset, + unsigned size) +{ + cachefs_trans_affects_block(trans, pageio->mapped_block, offset, size); +} + +static inline +void cachefs_trans_affects_inode(struct cachefs_transaction *trans, + struct cachefs_inode *inode) +{ + struct cachefs_super *super = inode->vfs_inode.i_sb->s_fs_info; + + cachefs_trans_affects_block(trans, + inode->metadata, + inode->metadata_offset, + super->layout->metadata_size); +} + +static inline void cachefs_trans_affects_super(struct cachefs_transaction *trans) +{ + struct cachefs_super *super = trans->super; + cachefs_trans_affects_page(trans, + cachefs_page_grab_private( + virt_to_page(super->layout)), + 0, + super->sb->s_blocksize); +} + +extern int cachefs_trans_mark(struct cachefs_transaction *trans); +extern void cachefs_trans_commit(struct cachefs_transaction *trans); +extern void cachefs_trans_commit_replay(struct cachefs_transaction *trans); +extern void cachefs_trans_batch_write(struct cachefs_super *super); +extern void cachefs_trans_batch_timer(unsigned long data); + +typedef enum { + CACHEFS_TRANS_SYNC_NOWAIT, /* don't wait - just begin write */ + CACHEFS_TRANS_SYNC_WAIT_FOR_MARK, /* wait until ujnl BATCH mark is written */ + 
CACHEFS_TRANS_SYNC_WAIT_FOR_ACK, /* wait until ujnl ACK mark is written */ +} cachefs_trans_syncwt_t; + +extern void cachefs_trans_sync(struct cachefs_super *super, + cachefs_trans_syncwt_t wait); + +extern int cachefs_ujnl_replay(struct cachefs_super *super); + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT,...) \ + printk("[%-6.6s] "FMT"\n",current->comm ,##__VA_ARGS__) +#define _dbprintk(FMT,...) do { } while(0) + +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define kdebug(FMT,...) dbgprintk(FMT ,##__VA_ARGS__) + +#define kjournal(FMT,...) _dbprintk(FMT ,##__VA_ARGS__) + +#define dbgfree(ADDR) _dbprintk("%s:%d: FREEING %p",__FILE__,__LINE__,ADDR) + +#define dbgpgalloc(PAGE) \ +do { \ + _dbprintk("PGALLOC %s:%d: %p {%lx,%lu}\n", \ + __FILE__,__LINE__, \ + (PAGE),(PAGE)->mapping->host->i_ino,(PAGE)->index \ + ); \ +} while(0) + +#define dbgpgfree(PAGE) \ +do { \ + if ((PAGE)) \ + _dbprintk("PGFREE %s:%d: %p {%lx,%lu}\n", \ + __FILE__,__LINE__, \ + (PAGE), \ + (PAGE)->mapping->host->i_ino, \ + (PAGE)->index \ + ); \ +} while(0) + +#ifdef __KDEBUG +#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) +#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) +#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) +#else +#define _enter(FMT,...) do { } while(0) +#define _leave(FMT,...) do { } while(0) +#define _debug(FMT,...) do { } while(0) +#endif + +extern void dump_bio(struct bio *bio, int n); + +#endif /* _LINUX_CACHEFS_INT_H */ diff -puN /dev/null fs/cachefs/cachefs-layout.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/cachefs-layout.h 2005-06-26 13:42:55.000000000 -0700 @@ -0,0 +1,503 @@ +/* cachefs-layout.h: general filesystem caching on-disc layout + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _LINUX_CACHEFS_LAYOUT_H +#define _LINUX_CACHEFS_LAYOUT_H + +#include + +enum cachefs_meta_inode_numbers { + CACHEFS_INO_NULL = 0x00000000, + CACHEFS_INO_METADATA = 0x00000001, + CACHEFS_INO_FSDEF_CATALOGUE, + CACHEFS_INO_ROOTDIR = CACHEFS_INO_FSDEF_CATALOGUE, + CACHEFS_INO__FIRST_FILE, + + /* virtual files all have the top bit set */ + CACHEFS_INO_MISC = 0x80000000, + CACHEFS_INO_WBJOURNAL = 0x80000001, + CACHEFS_INO_STATUS = 0x80000002, +}; + +typedef uint32_t cachefs_blockix_t; + +/*****************************************************************************/ +/* + * cache superblock block layout + * - the blockdev is prepared for initialisation by + * 'echo "cachefs___" >/dev/hdaXX' before mounting + * - when initialised, the magic number is changed to "cachefsrdy" + */ +struct cachefs_ondisc_superblock +{ + uint8_t magic[10]; /* magic number */ +#define CACHEFS_SUPER_MAGIC "cachefsrdy" +#define CACHEFS_SUPER_MAGIC_NEEDS_INIT "cachefs___" +#define CACHEFS_SUPER_MAGIC_SIZE 10 + + uint16_t endian; /* 0x1234 stored CPU-normal order */ +#define CACHEFS_SUPER_ENDIAN 0x1234 + + uint32_t version; /* format version */ +#define CACHEFS_SUPER_VERSION 1 + + /* layout */ + uint32_t bsize; /* cache block size */ + uint32_t metadata_size; /* cache metadata record size */ + uint32_t metadata_bits; /* log2 cache metadata record size */ + uint32_t ujnl_rsize; /* update journal record size */ + uint32_t ujnl_recperblk; /* u-journal records per block */ + cachefs_blockix_t bix_ujournal; /* start of update journal */ + cachefs_blockix_t bix_vjournal; /* start of invalid block journal */ + cachefs_blockix_t bix_wbjournal; /* start of writeback journal */ + 
cachefs_blockix_t bix_cache; /* start of data cache */ + cachefs_blockix_t bix_unready; /* start of initially unallocated blocks */ + cachefs_blockix_t bix_end; /* start of end of cache */ +}; + +/*****************************************************************************/ +/* + * on-disc index entry header + */ +struct cachefs_ondisc_index_entry +{ + uint32_t state : 7; +#define CACHEFS_ONDISC_INDEX_FREE 0x7e /* entry can be allocated */ +#define CACHEFS_ONDISC_INDEX_RECYCLE 0x65 /* entry scheduled for recycling */ +#define CACHEFS_ONDISC_INDEX_ACTIVE 0x2c /* entry active */ +#define CACHEFS_ONDISC_INDEX_PINNED 0x43 /* entry pinned (metadata file only) */ + + uint32_t type : 1; +#define CACHEFS_ONDISC_INDEX_DATAFILE 0 +#define CACHEFS_ONDISC_INDEX_INDEXFILE 1 + + uint32_t ino : 24; /* inode containing catalogue/data */ + + union { + uint32_t freelink[0]; /* next free entry pointer */ + uint8_t data[0]; /* the index data */ + } u; +}; + +#define CACHEFS_ONDISC_INDEX_ENTRY_MINSIZE \ + (sizeof(struct cachefs_ondisc_index_entry) + sizeof(uint32_t)) + +/* index definition description */ +struct cachefs_ondisc_index_def +{ + uint16_t dsize; + uint16_t esize; + uint16_t keys[4]; + uint8_t type[8]; + +#define CACHEFS_ONDISC_INDEXKEY_KLEN 0x0FFF /* length of key segment */ +#define CACHEFS_ONDISC_INDEXKEY_TYPE 0xF000 /* type of key segment */ +#define CACHEFS_ONDISC_INDEXKEY_NOTUSED 0x0000 /* - segment not used */ +#define CACHEFS_ONDISC_INDEXKEY_BIN 0x1000 /* - binary data */ +#define CACHEFS_ONDISC_INDEXKEY_ASCIIZ 0x2000 /* - null-terminated string */ +#define CACHEFS_ONDISC_INDEXKEY_IPV4 0x3000 /* - IPv4 address */ +#define CACHEFS_ONDISC_INDEXKEY_IPV6 0x4000 /* - IPv6 address */ + + uint8_t data[0]; +}; + +/*****************************************************************************/ +/* + * on-disc metadata record + * - padded out to sector size and stored several to a block + * - only the data version is necessary + * - disconnected operation is not supported + * 
- afs_iget() contacts the server to get the meta-data _anyway_ when an + * inode is first brought into memory + * - at least 64 direct block pointers will be available + * - any block pointer which is 0 indicates an uncached page + */ +struct cachefs_ondisc_metadata +{ + struct cachefs_ondisc_index_entry header; + + uint32_t freelink; /* head of free entry list (or UINT_MAX) */ + uint32_t atime; /* last access time */ + uint32_t mtime; /* last modification time */ + uint32_t pindex; /* parent index ID (0 for top of tree) */ + uint32_t pindex_entry; /* parent index entry number */ + uint64_t size; /* size of file */ + + /* index file definition */ + struct cachefs_ondisc_index_def index; + + /* file contents - recycling depends on triple_indirect being first */ + cachefs_blockix_t triple_indirect; /* triple indirect block index */ + cachefs_blockix_t double_indirect; /* double indirect block index */ + cachefs_blockix_t single_indirect; /* single indirect block index */ + cachefs_blockix_t direct[0]; /* direct block ptrs */ +}; + +/*****************************************************************************/ +/* + * on-disc cached network filesystem definition record + * - each entry resides in its own sector + */ +struct cachefs_ondisc_fsdef +{ + uint8_t name[24]; /* name of netfs */ + uint32_t version; /* version of layout */ +}; + +/*****************************************************************************/ +/* + * Free blocks are kept in a pair of very one-sided trees (more horsetail + * plants than trees) + * + * +---------+ +---------+ +---------+ +---------+ + * stk--->| |--->| |--->| |--->| |---> NULL + * | NODE | | NODE | | NODE | | NODE | + * | | | | | | | | + * +---------+ +---------+ +---------+ +---------+ + * / | \ / | \ / | \ / | \ + * free blocks free blocks free blocks free blocks + * + * - each free block is on one of two trees, both pointed to by the ujournal: + * - the "recycling stack" - all newly freed blocks end up on here + * - the "alloc 
stack" - all allocations are popped off here + * - when the alloc stack is empty, the recycling stack is transferred into + * it + * - the front node on the alloc stack is the current source of block + * allocations + * - when all a node's leaves have been allocated, then the node itself will + * be allocated + * - the front node on the recycling stack is the current sink of recycled + * blocks + */ +struct cachefs_ondisc_free_node +{ + cachefs_blockix_t next; /* next node in free tree */ + uint32_t count; /* number of blocks in tree after this one */ + cachefs_blockix_t leaves[0]; /* free blocks depending from this block */ +}; + +#define CACHEFS_ONDISC_LEAVES_PER_FREE_NODE \ + ((PAGE_SIZE - sizeof(struct cachefs_ondisc_free_node)) / sizeof(cachefs_blockix_t)) + +/*****************************************************************************/ +/* + * on-disc update journal + * - records changes being made to disc content, particularly the metadata + * - the serial number cycles through in ascending order + * - ACKs specify everything between "index" & "block" as being complete + * - serial numbers can wrap, but can't go into window of un-ACK'd marks + * - journal slots are the size of a sector (blockdev block size) + * - this means that two adjacent marks are made on separate sectors, and so + * the second doesn't have to wait for the first to be written to disc + * - the current slot allocation point is not permitted to lap the currently + * un-ACK'd slots - the requestor must wait + */ +enum cachefs_ondisc_ujnl_mark { + /* NULL mark */ + CACHEFS_ONDISC_UJNL_NULL, + + /* batch stop mark */ + CACHEFS_ONDISC_UJNL_BATCH, + + /* batch completion mark */ + CACHEFS_ONDISC_UJNL_ACK, + + /* beginning new recycle_stk front node + * - block = block being begun + * - index = old front recycling node + * - ixentry = old front recycling node's count + * - upblock = block from which transferred (or 0 if from unready list) + * - upentry = entry in upblock[] + * - pgnum = new 
super->layout.bix_unready + */ + CACHEFS_ONDISC_UJNL_RECYC_BEGIN_NEW, + + /* transfer recycle_stk to alloc_stk + * - block = front block being transferred + * - upblock = 0 or else block at TOS of recycling stack if this was 2OS + */ + CACHEFS_ONDISC_UJNL_RECYC_TRANSFER, + + /* scavenge sets of pointers from super->rcyblock + * - block = block holding pointer array being processed + * - entry = index into block[] of first pointer transferred + * - auxblock = recycling node that dependents are transferred to + * - auxentry = index into auxblock[] of first leaf filled + * - count = number of pointers transferred + */ + CACHEFS_ONDISC_UJNL_RECYC_SCAVENGE, + + /* transfer bix_unready to recycle_stk + * - block = recycling node that blocks were pasted into + * - entry = index into block[] of first pointer inserted + * - auxblock = first unready block transferred + * - pgnum = new super->layout.bix_unready + * - count = number of blocks pasted + */ + CACHEFS_ONDISC_UJNL_RECYC_MAKEREADY, + + /* data file being created + * - index = parent index being attached to + * - ixentry = entry in parent index + * - pgnum = page in file holding index entry being allocated + * - block = block holding index entry being allocated + * - entry = offset of entry in block + * - ino = inode being attached to hold index contents + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + * - upblock = metadata file block holding index metadata + * - upentry = offset of entry in upblock + * - count = size of index entry in block + * - ixdata = index data + * - next_ino = next free metadata file entry + * - next_index = next free index file entry + */ + CACHEFS_ONDISC_UJNL_INODE_CREATING, + + /* data file being updated */ + CACHEFS_ONDISC_UJNL_INODE_UPDATING, + + /* data or index file being deleted + * - index = parent index being attached to [opt] + * - ixentry = entry in parent index [opt] + * - pgnum = page in file holding index entry being 
allocated [opt] + * - block = block holding index entry being allocated [opt] + * - entry = offset of entry in block [opt] + * - ino = inode being attached to hold index contents + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + * - upblock = metadata file block holding index metadata [opt] + * - upentry = offset of entry in upblock [opt] + * - count = size of index entry in block [opt] + * - next_ino = next free metadata file entry + * - next_index = next free index file entry [opt] + */ + CACHEFS_ONDISC_UJNL_INODE_DELETING, + + /* inode being marked for reclamation + * - ino = target inode + * - index = inode's parent index + * - ixentry = inode's parent index entry + * - pgnum = page in index holding entry being marked + * - block = metadata file block holding index metadata + * - entry = offset of entry in upblock + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + */ + CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM, + + /* inode being reclaimed + * - ino = target inode + * - index = inode's parent index + * - ixentry = inode's parent index entry + * - pgnum = page in index holding entry being marked + * - block = metadata file block holding index metadata + * - entry = offset of entry in upblock + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + */ + CACHEFS_ONDISC_UJNL_INODE_RECLAIMING, + + /* data file block allocation + * - ino = inode for which block allocated + * - pgnum = page of inode being instantiated + * - size = current file size + * - block = block allocated + * - auxblock = block holding inode's metadata + * - auxentry = offset in auxblock of metadata record + * - upblock = block which will point to this one + * - upentry = entry in block pointing to this one + * - auxmark = v-journal entry number + */ + CACHEFS_ONDISC_UJNL_DATA_ALLOCING, + + /* completed write on page in cache + * - ino = 
inode for which block was written + * - pgnum = which page of inode was written + * - block = block written + * - auxmark = v-journal entry number + */ + CACHEFS_ONDISC_UJNL_DATA_WRITTEN, + + /* data block being unallocated + * - index = old front recycling node + * - ixentry = old front recycling node's count + * - ino = inode to which block belongs + * - pgnum = which page of inode being unallocated + * - block = block being recycled + * - auxblock = (old) front recycling node + * - auxentry = index into auxblock[] of leaf filled (or UINT_MAX if new node) + * - upblock = block from which transferred + * - upentry = entry in upblock[] + * - auxmark = v-journal entry number + */ + CACHEFS_ONDISC_UJNL_DATA_UNALLOCING, + + /* indirect block being allocated + * - auxmark = which level being allocated + * - ino = inode for which block is being allocated + * - pgnum = which page of inode being allocated + * - size = current file size + * - block = block being allocated + * - auxblock = block holding inode's metadata + * - auxentry = offset in auxblock of metadata record + * - upblock = block which will point to this one + * - upentry = entry in block pointing to this one + */ + CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING, + + /* index file being extended (as for data block allocation) + * - ino = index inode + * - pgnum = page in file holding index entry being allocated + * - size = current file size + * - block = new block being allocated + * - auxblock = metadata file block holding index metadata + * - auxentry = offset of entry in auxblock + * - upblock = block holding pointer to new block + * - upentry = offset of entry in upblock + * - count = size of index entry (inc header) in block + * - next_index = next free index file entry + */ + CACHEFS_ONDISC_UJNL_INDEX_EXTENDING, + + /* index file being created + * - index = parent index being attached to + * - ixentry = entry in parent index + * - pgnum = page in file holding index entry being allocated + * - block = block 
holding index entry being allocated + * - entry = offset of entry in block + * - ino = inode being attached to hold index contents + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + * - upblock = metadata file block holding index metadata + * - upentry = offset of entry in upblock + * - count = size of index entry in block + * - ixdata = index definition and data + * - next_ino = next free metadata file entry + * - next_index = next free index file entry + */ + CACHEFS_ONDISC_UJNL_INDEX_CREATING, + + /* index entry being updated + * - index = index being modified + * - ixentry = entry in index + * - pgnum = page in file holding index entry being allocated + * - block = block holding index entry being allocated + * - entry = offset of entry in block + * - count = size of entry in block + * - ixdata = revised index data + */ + CACHEFS_ONDISC_UJNL_INDEX_UPDATING, + + CACHEFS_ONDISC_UJNL__LAST +} __attribute__((packed)); + +struct cachefs_ondisc_ujnl_index { + struct cachefs_ondisc_index_def def; + uint32_t next_ino; /* next inode entry */ + uint32_t next_index; /* next index entry */ + uint8_t data[0]; +}; + +struct cachefs_ondisc_update_journal +{ + enum cachefs_ondisc_ujnl_mark mark; + + uint32_t auxmark; +#define CACHEFS_ONDISC_UJNL_SINGLE_0 0 /* single indirect (1 of) */ +#define CACHEFS_ONDISC_UJNL_DOUBLE_0 1 /* double indirect level 0 (1 of) */ +#define CACHEFS_ONDISC_UJNL_DOUBLE_1 2 /* double indirect level 1 (1K of) */ +#define CACHEFS_ONDISC_UJNL_TRIPLE_0 3 /* triple indirect level 0 (1 of) */ +#define CACHEFS_ONDISC_UJNL_TRIPLE_1 4 /* triple indirect level 1 (1K of) */ +#define CACHEFS_ONDISC_UJNL_TRIPLE_2 5 /* triple indirect level 2 (1M of) */ + + int16_t batch; /* batch number */ + uint16_t serial; /* serial number of entry in batch */ + uint32_t ino; /* in-cache inode number */ + uint32_t pgnum; + uint32_t size; + uint32_t index; + uint32_t ixentry; + uint16_t entry; + uint16_t auxentry; + uint16_t 
upentry; + uint16_t rcm_ptrnext; /* next ptr in rcm_block to be reclaimed */ + uint16_t rcm_ptrstop; /* last ptr in rcm_block + 1 */ + uint16_t count; + uint16_t alloc_leaf; /* current alloc point in alloc_cur */ + uint16_t rcm_indirect; /* indirect block being reclaimed */ + uint32_t rcm_ino; /* number of inode being reclaimed */ + cachefs_blockix_t block; + cachefs_blockix_t auxblock; + cachefs_blockix_t upblock; + cachefs_blockix_t rcm_block; /* block currently being reclaimed */ + cachefs_blockix_t alloc_cur; /* current block allocation node */ + cachefs_blockix_t recycle_cur; /* current block recycling node */ + + union { + /* recycled pointers */ + cachefs_blockix_t rcyptrs[0]; + + /* new/updated index entry */ + struct cachefs_ondisc_ujnl_index ixdata[0]; + + /* miscellaneous data */ + uint8_t data[0]; + } u; +}; + +#define CACHEFS_ONDISC_UJNL_NUMENTS 4096 /* number of entries in the u-journal */ +#define CACHEFS_ONDISC_UJNL_MIN_REC_SIZE 512 /* minimum u-journal record size */ + +/*****************************************************************************/ +/* + * on-disc block validity journal + * - blocks noted here don't yet have valid data downloaded from the remote + * server + * - unused entries have ino==0 + * - changed under the influence of the u-journal + */ +struct cachefs_ondisc_validity_journal +{ + uint32_t ino; /* inode number */ + uint32_t pgnum; /* page within inode */ +}; + +#define CACHEFS_ONDISC_VJNL_ENTPERPAGE \ + (PAGE_SIZE / sizeof(struct cachefs_ondisc_validity_journal)) + +#define CACHEFS_ONDISC_VJNL_SIZE 16 /* blocks */ + +#define CACHEFS_ONDISC_VJNL_ENTS \ + (CACHEFS_ONDISC_VJNL_ENTPERPAGE * CACHEFS_ONDISC_VJNL_SIZE) + +/*****************************************************************************/ +/* + * on-disc writeback journal + * - records pages that are pending being written back to the server + */ +struct cachefs_ondisc_writeback_journal +{ + uint32_t ino; /* in-cache inode number */ + uint32_t size; /* size of changed 
region */ + uint64_t fpos; /* start file position */ + uint8_t fsdata[8]; /* FS-specific data */ +}; + +#define CACHEFS_ONDISC_WBJNL_ENTPERPAGE \ + (PAGE_SIZE / sizeof(struct cachefs_ondisc_writeback_journal)) + +#define CACHEFS_ONDISC_WBJNL_SIZE 128 /* blocks */ + +#define CACHEFS_ONDISC_WBJNL_ENTS \ + (CACHEFS_ONDISC_WBJNL_ENTPERPAGE * CACHEFS_ONDISC_WBJNL_SIZE) + +#endif /* _LINUX_CACHEFS_LAYOUT_H */ diff -puN /dev/null fs/cachefs/index.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/index.c 2005-06-26 13:42:55.000000000 -0700 @@ -0,0 +1,970 @@ +/* index.c: general filesystem cache: index file management + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * - all index files are arranged in pages + * - each page contains an array of fixed length records + * - the length recorded in the metadata data for that file + * - each page will have a gap at the end if the records don't fit exactly + * - normally all pages will be allocated and there won't be any holes + * - the metadata records file is the only exception to this + * - each file maintains a list of allocated but currently unused entries + */ + +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +struct cachefs_index_search_record { + struct cachefs_cookie *index; + struct cachefs_cookie *target; + struct cachefs_inode *iinode; + unsigned entsize; + unsigned ino; + unsigned entry; +}; + +/*****************************************************************************/ +/* + * mark an inode/index entry pair for deletion when so requested by the match + * function supplied by the netfs + */ +static void cachefs_index_search_delete(struct cachefs_index_search_record *rec, + struct page *ixpage, + unsigned ixentry, + unsigned ixoffset, + unsigned ino) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_transaction *trans; + struct cachefs_super *super; + struct cachefs_inode *inode; + unsigned long flags; + int ret; + + _enter(",{%lx},%u,%u,%u", ixpage->index, ixentry, ixoffset, ino); + + _debug("SEARCH/DELETE %u", ino); + + super = ixpage->mapping->host->i_sb->s_fs_info; + + /* get the index file inode */ + inode = cachefs_iget(super, ino); + if (IS_ERR(inode)) { + _leave(" [iget error %ld]", PTR_ERR(inode)); + return; + } + + BUG_ON(!list_empty(&inode->cookie_link)); + + /* create a transaction to record the reclamation */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM; + trans->jentry->ino = inode->vfs_inode.i_ino; + trans->jentry->index = 
rec->iinode->vfs_inode.i_ino; + trans->jentry->ixentry = ixentry; + trans->jentry->pgnum = ixpage->index; + trans->jentry->block = __cachefs_get_page_block(ixpage)->bix; + trans->jentry->entry = ixoffset; + trans->jentry->auxblock = inode->metadata->bix; + trans->jentry->auxentry = inode->metadata_offset; + + cachefs_trans_affects_page(trans, cachefs_page_grab_private(ixpage), + ixoffset, sizeof(*xent)); + cachefs_trans_affects_inode(trans, inode); + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + /* change the parent index entry and the index's inode entry as to the + * recycle state */ + cachefs_page_modify(super, &ixpage); + + xent = kmap_atomic(ixpage, KM_USER0) + ixoffset; + xent->state = CACHEFS_ONDISC_INDEX_RECYCLE; + kunmap_atomic(xent, KM_USER0); + + metadata = cachefs_metadata_prewrite(inode); + metadata->header.state = CACHEFS_ONDISC_INDEX_RECYCLE; + cachefs_metadata_postwrite(inode, metadata); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + + /* attempt to schedule for immediate reclamation */ + spin_lock_irqsave(&super->rcm_lock, flags); + + if (CIRC_SPACE(super->rcm_imm_head, + super->rcm_imm_tail, + CACHEFS_RCM_IMM_BUFSIZE) > 0 + ) { + super->rcm_imm_buf[super->rcm_imm_head] = + inode->vfs_inode.i_ino; + super->rcm_imm_head = + (super->rcm_imm_head + 1) & + (CACHEFS_RCM_IMM_BUFSIZE - 1); + } + else { + set_bit(CACHEFS_SUPER_RCM_IMM_SCAN, &super->flags); + } + + spin_unlock_irqrestore(&super->rcm_lock, flags); + + /* wake up kcachefsd */ + set_bit(CACHEFS_SUPER_DO_RECLAIM, &super->flags); + wake_up(&super->dmn_sleepq); + + /* done */ + cachefs_iput(inode); + _leave(" [ok]"); + return; + + error: + cachefs_iput(inode); + cachefs_trans_put(trans); + _leave(" [error %d]", ret); + return; + +} /* end cachefs_index_search_delete() */ + +/*****************************************************************************/ +/* + * mark an inode/index entry pair for deletion 
when so requested by the match + * function supplied by the netfs + */ +static void cachefs_index_search_update(struct cachefs_index_search_record *rec, + struct page *ixpage, + unsigned ixentry, + unsigned ixoffset, + unsigned ino) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_transaction *trans; + struct cachefs_super *super; + int ret; + + _enter(",{%lx},%u,%u,%u", ixpage->index, ixentry, ixoffset, ino); + + super = ixpage->mapping->host->i_sb->s_fs_info; + + /* create a transaction to record the update */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INDEX_UPDATING; + trans->jentry->ino = ino; + trans->jentry->index = rec->iinode->vfs_inode.i_ino; + trans->jentry->ixentry = ixentry; + trans->jentry->pgnum = ixpage->index; + trans->jentry->block = __cachefs_get_page_block(ixpage)->bix; + trans->jentry->entry = ixoffset; + trans->jentry->count = rec->iinode->index_dsize; + + cachefs_trans_affects_page(trans, cachefs_page_grab_private(ixpage), + ixoffset, sizeof(*xent)); + + /* have the netfs transcribe the update into the transaction */ + rec->index->idef->update(rec->target->netfs_data, + trans->jentry->u.ixdata[0].data); + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + /* actually change the index entry in the page cache */ + cachefs_page_modify(super, &ixpage); + + xent = kmap_atomic(ixpage, KM_USER0) + ixoffset; + memcpy(xent->u.data, + trans->jentry->u.ixdata[0].data, + rec->iinode->index_dsize); + kunmap_atomic(xent, KM_USER0); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + _leave(" [ok]"); + return; + + error: + cachefs_trans_put(trans); + _leave(" [error %d]", ret); + return; + +} /* end cachefs_index_search_update() */ + +/*****************************************************************************/ +/* + * index file search actor + * - return size to 
continue, 0 to stop (search also stops when desc->count==0) + */ +static int cachefs_index_search_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_index_search_record *rec; + unsigned long stop, tmp, esize; + void *content; + int ret; + + _enter(",{%lu},%lu,%lu", page->index, offset, size); + + rec = (struct cachefs_index_search_record *) desc->arg.buf; + ret = size; + + /* round up to the first record boundary after the offset */ + tmp = offset; + offset += rec->entsize - 1; + offset -= offset % rec->entsize; + if (offset - tmp > size) + goto done; + + size -= offset - tmp; + + /* limit the search of this page to the amount specified in + * desc->count */ + stop = desc->count; + if (size < stop) + stop = size; + + esize = rec->entsize; + + /* search the elements on the page (ignoring the slack at the end) */ + content = kmap(page); + + for (; offset + esize <= stop; offset += esize) { + struct cachefs_ondisc_index_entry *xent = content + offset; + cachefs_match_val_t result; + unsigned ixentry; + + /* ignore invalid entries */ + if (xent->state == CACHEFS_ONDISC_INDEX_FREE || + xent->state == CACHEFS_ONDISC_INDEX_RECYCLE) + continue; + + ixentry = offset / esize; + ixentry += page->index * (PAGE_SIZE / esize); + + /* ask the netfs to judge the match */ + result = rec->index->idef->match(rec->target->netfs_data, + xent->u.data); + + switch (result) { + case CACHEFS_MATCH_SUCCESS_UPDATE: + /* the netfs said that it matched, but needs + * updating */ + cachefs_index_search_update(rec, page, ixentry, offset, + xent->ino); + + case CACHEFS_MATCH_SUCCESS: + /* the netfs said that it matched */ + rec->entry = tmp; + rec->ino = xent->ino; + + if (rec->ino == 0) { + printk("CacheFS: Unexpected 0 inode number in" + " index %lu ent %u {%lu [%u] +%lu}\n", + rec->iinode->vfs_inode.i_ino, + rec->entry, + page->index, + __cachefs_get_page_block(page)->bix, + offset / esize); + BUG(); + } + + desc->count = 0; + 
ret = 0; + break; + + case CACHEFS_MATCH_SUCCESS_DELETE: + /* the netfs said that it matched, but this entry + * should be marked obsolete */ + cachefs_index_search_delete(rec, page, ixentry, offset, + xent->ino); + + case CACHEFS_MATCH_FAILED: + /* the netfs said there wasn't a valid match */ + default: + break; + } + } + + kunmap(page); + + done: + desc->count -= ret; + desc->written += ret; + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_index_search_actor() */ + +/*****************************************************************************/ +/* + * search for the specified target object in an index in one cache + * - returns -ENOENT if not found + * - returns 0 if found, and stores the entry number in *_entry and the inode + * number of the backing file in *_ino + */ +int cachefs_index_search(struct cachefs_inode *index, + struct cachefs_cookie *target, + unsigned *_entry, + unsigned *_ino) +{ + struct cachefs_index_search_record rec; + struct file_ra_state ra; + read_descriptor_t desc; + loff_t pos; + int ret; + + _enter("{%s,%lu,%Lu}", + index->cookie->idef->name, + index->vfs_inode.i_ino, + i_size_read(index->vfs_inode)); + + if (_entry) + *_entry = UINT_MAX; + if (_ino) + *_ino = 0; + + ret = -ENOENT; + if (i_size_read(&index->vfs_inode) == 0) + goto out; + + /* prepare a record of what we want to do */ + rec.iinode = index; + rec.index = index->cookie; + rec.target = target; + rec.entsize = rec.iinode->index_esize; + rec.entry = UINT_MAX; + rec.ino = 0; + + /* scan the file through the pagecache, making use of readahead */ + memset(&ra, 0, sizeof(ra)); + file_ra_state_init(&ra, rec.iinode->vfs_inode.i_mapping); + + desc.written = 0; + desc.count = i_size_read(&rec.iinode->vfs_inode); + desc.arg.buf = (char *) &rec; + desc.error = 0; + + pos = 0; + + do_generic_mapping_read(rec.iinode->vfs_inode.i_mapping, &ra, NULL, + &pos, &desc, cachefs_index_search_actor); + + if (desc.error) { + /* we got an error */ + ret = desc.error; + } + else if 
(rec.entry == UINT_MAX) { + /* we didn't find an entry */ + ret = -ENOENT; + } + else { + /* we found an entry */ + BUG_ON(rec.ino == 0); + + if (_entry) + *_entry = rec.entry; + if (_ino) + *_ino = rec.ino; + ret = 0; + } + + out: + _leave(" = %d [ent=%d ino=%u]", ret, rec.entry, rec.ino); + return ret; + +} /* end cachefs_index_search() */ + +/*****************************************************************************/ +/* + * initialise a new index page (called in lieu of readpage) + */ +static int cachefs_index_preinit_page(void *data, struct page *page) +{ + struct cachefs_page *pageio; + + _enter(",%p{%lu}", page, page->index); + + /* attach a mapping cookie to the page */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) { + _leave(" = %ld", PTR_ERR(pageio)); + return PTR_ERR(pageio); + } + + /* clear the page */ + clear_highpage(page); + + /* done */ + SetPageUptodate(page); + unlock_page(page); + _leave(" = 0"); + return 0; + +} /* end cachefs_index_preinit_page() */ + +/*****************************************************************************/ +/* + * select a new entry in an index file, extending the file if necessary + */ +static int cachefs_index_select_free_entry(struct cachefs_inode *iinode, + struct page **_page, + unsigned *_newentry, + unsigned *_next) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct page *page; + unsigned newentry, pgnum, offset, next; + int ret; + + _enter("{%lu},", iinode->vfs_inode.i_ino); + + *_page = NULL; + *_newentry = 0; + *_next = 0; + + /* find the next free entry pointer from the metadata record for this + * inode */ + metadata = cachefs_metadata_preread(iinode); + newentry = metadata->freelink; + cachefs_metadata_postread(iinode, metadata); + + _debug("free entry: %u [size %Lu]", + newentry, i_size_read(iinode->vfs_inode)); + + /* extend the index file if there are no new entries */ + if (newentry == UINT_MAX) { + pgnum = 
i_size_read(&iinode->vfs_inode) >> PAGE_SHIFT; + + /* we need to get the new contents for this block ready in + * advance */ + page = read_cache_page(iinode->vfs_inode.i_mapping, pgnum, + cachefs_index_preinit_page, NULL); + dbgpgalloc(page); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto error; + } + + /* get a block to back the new page with */ + i_size_write(&iinode->vfs_inode, + i_size_read(&iinode->vfs_inode) + PAGE_SIZE); + + ret = cachefs_indr_io_get_block(&iinode->vfs_inode, page, + cachefs_page_grab_private(page), + 1); + if (ret < 0) { + i_size_write(&iinode->vfs_inode, + i_size_read(&iinode->vfs_inode) - + PAGE_SIZE); + goto error2; + } + + /* that will have populated the free list */ + metadata = cachefs_metadata_preread(iinode); + newentry = metadata->freelink; + cachefs_metadata_postread(iinode, metadata); + + BUG_ON(newentry == UINT_MAX); + _debug("done"); + } + /* otherwise we read the page holding the next free entry from disc */ + else { + filler_t *filler = + (filler_t *) + iinode->vfs_inode.i_mapping->a_ops->readpage; + + if (!iinode->index_epp) { + printk("CacheFS:" + " Index %lu {meta %u+%u} has zero-sized entries" + " (%hu/%hu/%hu)\n", + iinode->vfs_inode.i_ino, + iinode->metadata->bix, + iinode->metadata_offset, + iinode->index_dsize, + iinode->index_esize, + iinode->index_epp); + BUG(); + } + + /* do the read of the appropriate page */ + pgnum = newentry / iinode->index_epp; + page = read_cache_page(iinode->vfs_inode.i_mapping, pgnum, + filler, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto error; + } + + dbgpgalloc(page); + } + + /* read the next free entry pointer from the index entry we're about to + * fill in */ + BUG_ON(!__cachefs_get_page_block(page)); + + offset = (newentry % iinode->index_epp) * iinode->index_esize; + + xent = kmap_atomic(page, KM_USER0) + offset; + next = xent->u.freelink[0]; + kunmap_atomic(xent, KM_USER0); + + /* done */ + *_page = page; + *_newentry = newentry; + *_next = next; + + _leave(" = 0 
[{%lu},%u,%u]", page->index, newentry, next); + return 0; + + error2: + cachefs_put_page(page); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_index_select_free_entry() */ + +/*****************************************************************************/ +/* + * allocate an entry in the specified index file and associate an inode with it + * - target->cookie->def determines whether the new inode will be a file or an + * index + * - if an inode is successfully allocated *_newino will be set with the inode + * number + */ +int cachefs_index_add(struct cachefs_inode *index, + struct cachefs_cookie *cookie, + unsigned *_newino) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_ujnl_index *jindex; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_search_result *srch; + struct cachefs_transaction *trans; + struct cachefs_super *super; + struct page *inopage, *ixpage; + unsigned ino, ixentry, offset, inonext, ixnext, ino_offset; + int ret, loop; + + _enter("{%lu},{%s},", + index->vfs_inode.i_ino, index->cookie->idef->name); + + *_newino = 0; + + super = index->vfs_inode.i_sb->s_fs_info; + inopage = NULL; + ixpage = NULL; + trans = NULL; + + /* reserve the next free entry in the parent index */ + ret = cachefs_index_select_free_entry(index, + &ixpage, &ixentry, &ixnext); + if (ret < 0) + goto error; + + offset = (ixentry % index->index_epp) * index->index_esize; + + /* reserve the next free entry in the inode metadata index */ + ret = cachefs_index_select_free_entry(super->imetadata, + &inopage, &ino, &inonext); + if (ret < 0) + goto error; + + ino_offset = ino % super->imetadata->index_epp; + ino_offset <<= super->layout->metadata_bits; + + _debug("entry %u ino %u", ixentry, ino); + + /* create a transaction to record the addition */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INDEX_CREATING; + if (!cookie->idef) + 
trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_CREATING; + + trans->jentry->index = index->vfs_inode.i_ino; + trans->jentry->ixentry = ixentry; + trans->jentry->ino = ino; + trans->jentry->size = i_size_read(&index->vfs_inode); + trans->jentry->pgnum = ixpage->index; + trans->jentry->block = __cachefs_get_page_block(ixpage)->bix; + trans->jentry->entry = offset; + trans->jentry->count = index->index_dsize; + trans->jentry->auxblock = __cachefs_get_page_block(inopage)->bix; + trans->jentry->auxentry = ino_offset; + trans->jentry->upblock = index->metadata->bix; + trans->jentry->upentry = index->metadata_offset; + + cachefs_trans_affects_page(trans, cachefs_page_grab_private(ixpage), + offset, index->index_esize); + cachefs_trans_affects_page(trans, cachefs_page_grab_private(inopage), + ino_offset, super->layout->metadata_size); + + cachefs_trans_affects_inode(trans, index); + cachefs_trans_affects_inode(trans, super->imetadata); + + /* also store in the journal information about the index modifications + * we're going to make, including the netfs's search keys and other + * data */ + jindex = &trans->jentry->u.ixdata[0]; + jindex->next_ino = inonext; + jindex->next_index = ixnext; + + index->cookie->idef->update(cookie->netfs_data, jindex->data); + + /* if we're adding a new index, we store its definition in the journal + * too */ + if (cookie->idef) { + struct cachefs_index_def *definition = cookie->idef; + + jindex->def.dsize = definition->data_size; + jindex->def.esize = definition->data_size; + jindex->def.esize += + sizeof(struct cachefs_ondisc_index_entry); + + if (jindex->def.esize < CACHEFS_ONDISC_INDEX_ENTRY_MINSIZE) + jindex->def.esize = CACHEFS_ONDISC_INDEX_ENTRY_MINSIZE; + + for (loop = 0; loop < 4; loop++) { + jindex->def.keys[loop] = + definition->keys[loop].len & + CACHEFS_ONDISC_INDEXKEY_KLEN; + jindex->def.keys[loop] |= + definition->keys[loop].type << 12; + } + + strncpy(jindex->def.type, + definition->name, + sizeof(jindex->def.type)); + } + + /* 
record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + /* we can now make the changes in the page cache */ + cachefs_page_modify(super, &ixpage); + cachefs_page_modify(super, &inopage); + + /* fill the index entry */ + xent = kmap_atomic(ixpage, KM_USER0) + offset; + xent->state = CACHEFS_ONDISC_INDEX_ACTIVE; + xent->ino = ino; + xent->type = CACHEFS_ONDISC_INDEX_DATAFILE; + + if (cookie->idef) + xent->type = CACHEFS_ONDISC_INDEX_INDEXFILE; + + memcpy(xent->u.data, jindex->data, index->index_dsize); + + kunmap_atomic(xent, KM_USER0); + + /* modify the index inode metadata entry */ + metadata = cachefs_metadata_prewrite(index); + metadata->freelink = ixnext; + metadata->atime = CURRENT_TIME.tv_sec; + cachefs_metadata_postwrite(index, metadata); + + /* fill the inode definition */ + metadata = kmap_atomic(inopage, KM_USER0) + ino_offset; + memset(metadata, 0, super->imetadata->index_esize); + + metadata->header.state = CACHEFS_ONDISC_INDEX_ACTIVE; + metadata->header.ino = 0xfefefe; + metadata->size = 0; + metadata->freelink = UINT_MAX; + metadata->mtime = CURRENT_TIME.tv_sec; + metadata->atime = CURRENT_TIME.tv_sec; + metadata->pindex = index->vfs_inode.i_ino; + metadata->pindex_entry = ixentry; + + metadata->index = jindex->def; + + kunmap_atomic(metadata, KM_USER0); + + /* modify the metadata inode metadata entry */ + metadata = cachefs_metadata_prewrite(super->imetadata); + metadata->freelink = inonext; + metadata->atime = CURRENT_TIME.tv_sec; + cachefs_metadata_postwrite(super->imetadata, metadata); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + trans = NULL; + + /* add the new inode to the cookie's list of search results */ + list_for_each_entry(srch, &cookie->search_results, link) { + if (srch->super == super) { + srch->ino = ino; + break; + } + } + + *_newino = ino; + + error: + cachefs_trans_put(trans); + cachefs_put_page(inopage); + cachefs_put_page(ixpage); + + _leave(" = %d", 
ret); + return ret; + +} /* end cachefs_index_add() */ + +/*****************************************************************************/ +/* + * update the index entry for an index or data file from the associated netfs + * data + */ +int cachefs_index_update(struct cachefs_inode *inode) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *meta; + struct cachefs_cookie *cookie = inode->cookie; + struct cachefs_super *super; + struct cachefs_inode *index; + struct cachefs_block *block; + struct page *ixpage; + unsigned offs; + int ret; + + _enter(""); + + super = inode->vfs_inode.i_sb->s_fs_info; + + if (test_bit(CACHEFS_SUPER_WITHDRAWN, &super->flags)) + return 0; + + /* the index entry for this inode lives in the parent index inode */ + list_for_each_entry(index, + &cookie->iparent->backing_inodes, + cookie_link) { + if (index->vfs_inode.i_sb == inode->vfs_inode.i_sb) + goto found_parent_index_inode; + } + + /* hmmm... the parent inode is strangely absent */ + BUG(); + return -ENOENT; + + found_parent_index_inode: + /* find the entry number of this inode's index entry */ + meta = cachefs_metadata_preread(inode); + offs = meta->pindex_entry; + cachefs_metadata_postread(inode, meta); + + /* get the page holding the index data */ + ixpage = cachefs_get_page(index, offs / index->index_epp); + if (IS_ERR(ixpage)) { + _leave(" = %ld", PTR_ERR(ixpage)); + return PTR_ERR(ixpage); + } + + offs = (offs % index->index_epp) * index->index_esize; + + _debug("update ino=%lx pg={%lu}+%x", + index->vfs_inode.i_ino, ixpage->index, offs); + + /* we just alter the index entry directly without journalling the + * change - if what's on disc winds up obsolete because someone trips + * over the power cable, the netfs will ask for the entry to be deleted + * later. 
We do, however, let the journal writer write the block for us + */ + block = __cachefs_get_page_block(ixpage); + + ret = cachefs_block_begin_alter(block); + if (ret < 0) + goto error_page; + + /* we may now need to look at a different page as the old one may have + * been C-O-W'd */ + cachefs_block_modify(super, block, &ixpage); + + /* get the netfs to make the change */ + xent = kmap_atomic(ixpage, KM_USER0) + offs; + cookie->iparent->idef->update(cookie->netfs_data, xent->u.data); + kunmap_atomic(xent, KM_USER0); + + cachefs_block_end_alter(block); + + error_page: + cachefs_put_page(ixpage); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_index_update() */ + +/*****************************************************************************/ +/* + * mark as obsolete the next inode pinned by an entry in the index currently + * being reclaimed + * - called from kcachefsd + */ +int cachefs_index_reclaim_one_entry(struct cachefs_super *super, + struct cachefs_transaction **_trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_transaction *trans; + struct cachefs_inode *inode = NULL; + unsigned long flags; + struct page *page = NULL; + unsigned pgnum, offset, ino; + int ret; + + _enter("{%x,%x}", super->rcm_ino, super->rcm_block); + + try_next_block: + /* find the next block we're going to scan */ + pgnum = super->rcm_block / super->rcm_inode->index_epp; + offset = super->rcm_block % super->rcm_inode->index_epp; + offset *= super->rcm_inode->index_esize; + + if (pgnum >= (i_size_read(&super->rcm_inode->vfs_inode) >> PAGE_SHIFT)) { + /* we've done this index entirely */ + _leave(" = 0"); + return 0; + } + + /* get the page holding the next index entry and extract the inode + * number from it */ + page = cachefs_get_page(super->rcm_inode, pgnum); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EIO) { + /* forget about this block - it's buggy */ + super->rcm_block = + (pgnum + 1) * 
super->rcm_inode->index_epp; + } + + _leave(" = %ld", PTR_ERR(page)); + return PTR_ERR(page); + } + + try_next_entry: + xent = kmap_atomic(page, KM_USER0) + offset; + ino = xent->ino; + BUG_ON(ino == 0 && xent->state != CACHEFS_ONDISC_INDEX_FREE); + BUG_ON(ino != 0 && xent->state == CACHEFS_ONDISC_INDEX_FREE); + kunmap_atomic(xent, KM_USER0); + + if (!ino) { + _debug("skip slot %u", super->rcm_block); + super->rcm_block++; + + offset += super->rcm_inode->index_esize; + if (offset + super->rcm_inode->index_esize <= PAGE_SIZE) + goto try_next_entry; + + cachefs_put_page(page); + page = NULL; + goto try_next_block; + } + + inode = cachefs_iget(super, ino); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -EIO) + super->rcm_block++; + goto error_noinode; + } + + /* use the pre-created a transaction to record the change of state */ + trans = *_trans; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM; + trans->jentry->ino = inode->vfs_inode.i_ino; + trans->jentry->index = super->rcm_ino; + trans->jentry->ixentry = super->rcm_block; + trans->jentry->pgnum = page->index; + trans->jentry->block = __cachefs_get_page_block(page)->bix; + trans->jentry->entry = offset; + trans->jentry->auxblock = inode->metadata->bix; + trans->jentry->auxentry = inode->metadata_offset; + + cachefs_trans_affects_inode(trans, inode); + + trans->jentry->rcm_block = super->rcm_block + 1; + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + *_trans = NULL; + + /* modify the inode metadata entry */ + metadata = cachefs_metadata_prewrite(inode); + metadata->header.state = CACHEFS_ONDISC_INDEX_RECYCLE; + metadata->pindex = 0; + metadata->pindex_entry = 0; + cachefs_metadata_postwrite(inode, metadata); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + + /* attempt to schedule the inode we've just marked for immediate + * reclamation */ + spin_lock_irqsave(&super->rcm_lock, flags); + + if 
(CIRC_SPACE(super->rcm_imm_head, + super->rcm_imm_tail, + CACHEFS_RCM_IMM_BUFSIZE) > 0 + ) { + super->rcm_imm_buf[super->rcm_imm_head] = + inode->vfs_inode.i_ino; + super->rcm_imm_head = + (super->rcm_imm_head + 1) & + (CACHEFS_RCM_IMM_BUFSIZE - 1); + } + else { + set_bit(CACHEFS_SUPER_RCM_IMM_SCAN, &super->flags); + } + + spin_unlock_irqrestore(&super->rcm_lock, flags); + + /* there may be more to do on this index */ + ret = -EAGAIN; + + error: + cachefs_iput(inode); + error_noinode: + cachefs_put_page(page); + + _leave(" = %d [%u]", ret, super->rcm_block); + return ret; + +} /* end cachefs_index_reclaim_one_entry() */ diff -puN /dev/null fs/cachefs/indirection-io.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/indirection-io.c 2005-06-26 13:42:55.000000000 -0700 @@ -0,0 +1,833 @@ +/* indirection-io.c: indirection-tree based files I/O operations + * + * Indirection tree based files comprise most of the files in cachefs; + * they can have blocks scattered all over the place, and to find them + * block pointers and indirection blocks are used. These are arranged + * in prototypical UNIX fashion with deeper levels of indirection the + * further into a file a block is. All data cache files and index + * files are in this form. + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Derived from ext2 equivalents + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +struct cachefs_io_block_path { + struct page *page; + struct cachefs_page *pageio; /* page => block mapping */ + cachefs_blockix_t bix; /* block number for this level */ + unsigned offset; /* offset into parent pointer block */ + + unsigned flags; +#define CACHEFS_BLOCK_IS_INODE 0x00000001 +#define CACHEFS_BLOCK_NEW 0x00000002 +#define CACHEFS_BLOCK_WRITTEN 0x00000004 +#define CACHEFS_BLOCK_INIT_INDIRECT 0x00000008 +#define CACHEFS_BLOCK_INIT_INDEX 0x00000010 +#define CACHEFS_BLOCK_INIT_NETFSDATA 0x00000020 + + /* ujournal marks for allocation journalling entries */ + enum cachefs_ondisc_ujnl_mark mktype : 8; + u8 auxmark; + struct cachefs_transaction *transaction; +}; + +static int cachefs_indr_io_readpage(struct file *file, struct page *page); +static int cachefs_indr_io_readpages(struct file *file, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages); + +struct address_space_operations cachefs_indr_io_addrspace_operations = { + .readpage = cachefs_indr_io_readpage, + .readpages = cachefs_indr_io_readpages, + .writepage = cachefs_no_writepage, + .writepages = cachefs_no_writepages, + .prepare_write = cachefs_no_prepare_write, + .commit_write = cachefs_no_commit_write, + .set_page_dirty = cachefs_no_set_page_dirty, + .sync_page = cachefs_sync_page, + .invalidatepage = cachefs_invalidatepage, + .releasepage = cachefs_releasepage, +}; + +/*****************************************************************************/ +/* + * set up to read a page from disc + * - we try to amalgamate reads to consecutive pages + * - modelled on the stuff in fs/buffer.c + */ +static int cachefs_indr_io_do_readpage(struct bio **_bio, + struct page *page, + unsigned nr_pages, + sector_t *last_block_in_bio) +{ + struct cachefs_page *pageio; + struct 
inode *inode = page->mapping->host; + sector_t last_block; + int ret; + + _enter(""); + + /* get the page mapping cookie */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) { + ret = PTR_ERR(pageio); + goto error; + } + + /* check we aren't trying to go beyond the end of the file */ + last_block = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (page->index >= last_block) + goto hole; + + /* follow the on-disc block pointer indirection chain */ + if (inode->i_ino != CACHEFS_INO_METADATA || page->index != 0) { + ret = cachefs_indr_io_get_block(inode, page, pageio, 0); + if (ret<0) + goto error; + } + else { + /* the first block of the metadata file holds its own metadata, + * so we can't follow the chain there */ + ret = cachefs_block_set2(inode->i_sb->s_fs_info, 1, page, + pageio, NULL); + if (ret < 0) + goto error; + } + + /* handle a hole */ + if (!pageio->mapped_block) + goto hole; + + /* we need to add the page we're looking at to a BIO... if there's no + * current BIO, or the page is not contiguous with the current BIO's + * contents, then we need to start a new BIO + */ + if (!*_bio) + goto allocate_new_bio; + else if (*last_block_in_bio + 1 != pageio->mapped_block->bix) + goto dispatch_bio; + + /* add the page to the current BIO */ + add_page: + if (!bio_add_page(*_bio, page, PAGE_SIZE, 0)) + goto dispatch_bio; /* current BIO was full */ + + /* dispatch the BIO immediately if the current page lives on an + * indirection chain boundary */ + if (test_bit(CACHEFS_PAGE_BOUNDARY, &pageio->flags)) { + submit_bio(READ, *_bio); + *_bio = NULL; + } + else { + *last_block_in_bio = pageio->mapped_block->bix; + } + + _leave(" = 0"); + return 0; + + /* dispatch the current BIO and allocate a new one */ + dispatch_bio: + submit_bio(READ, *_bio); + allocate_new_bio: + ret = cachefs_io_alloc(inode->i_sb, + pageio->mapped_block->bix, + nr_pages, GFP_KERNEL, _bio); + if (ret < 0) { + *_bio = NULL; + goto error; + } + goto add_page; + + /* 
deal with a hole in the on-disc file + * - in a data cache file that represents an unfetched block + * - in an index file that's an error + */ + hole: + ret = -ENODATA; + if (test_bit(CACHEFS_ACTIVE_INODE_ISINDEX, + &CACHEFS_FS_I(inode)->flags)) { + printk("CacheFS: found unexpected hole in index/metadata file:" + " ino=%lu pg=%lu\n", + inode->i_ino, page->index); + ret = -EIO; + } + + error: + if (*_bio) { + submit_bio(READ, *_bio); + *_bio = NULL; + } + unlock_page(page); + + _leave("= %d", ret); + return ret; +} /* end cachefs_indr_io_do_readpage() */ + +/*****************************************************************************/ +/* + * read a bunch of pages from disc + */ +int cachefs_indr_io_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + sector_t last_block_in_bio = 0; + struct pagevec lru_pvec; + struct bio *bio = NULL; + unsigned page_idx; + int ret; + + _enter(",,%u", nr_pages); + + ret = 0; + pagevec_init(&lru_pvec, 0); + + /* read all the pages, merging requests where possible */ + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache(page, mapping, page->index, + GFP_KERNEL)) { + ret = cachefs_indr_io_do_readpage(&bio, + page, + nr_pages - page_idx, + &last_block_in_bio); + if (ret < 0) + break; + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else { + page_cache_release(page); + } + } + + /* dispatch any left over BIO */ + if (bio) + submit_bio(READ, bio); + + /* add the pages to the LRU queue */ + pagevec_lru_add(&lru_pvec); + BUG_ON(!list_empty(pages)); + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_indr_io_readpages() */ + +/*****************************************************************************/ +/* + * read a single page from disc + */ +int cachefs_indr_io_readpage(struct file *file, struct page 
*page)
+{
+	struct bio *bio = NULL;
+	sector_t last_block_in_bio = 0;
+	int ret;
+
+	_enter("{%lu}", page->index);
+
+	ret = cachefs_indr_io_do_readpage(&bio, page, 1, &last_block_in_bio);
+	if (bio)
+		submit_bio(READ, bio);
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_indr_io_readpage() */
+
+/*****************************************************************************/
+/*
+ * allocate a block
+ * - the transaction (and so its journal entry) is preallocated by the caller
+ *   and pointed to by step->transaction
+ * - step[1] is the level that holds the pointer to be filled in; its page is
+ *   locked for the duration of the call
+ * - returns 0 on success, with step->bix set to the chosen block and the
+ *   transaction committed
+ * - returns a negative error code on failure, with the transaction put
+ * - step->transaction is cleared in either case
+ */
+static int cachefs_indr_io_get_block_alloc(struct super_block *sb,
+					   struct cachefs_inode *inode,
+					   struct cachefs_io_block_path *step)
+{
+	struct cachefs_ondisc_update_journal *jentry;
+	struct cachefs_ondisc_free_node *node;
+	struct cachefs_ondisc_metadata *metadata;
+	struct cachefs_super *super = sb->s_fs_info;
+	/* start with no block so that the error_block path doesn't put an
+	 * uninitialised pointer if the lookup of the block itself fails */
+	struct cachefs_block *block = NULL;
+	cachefs_blockix_t alloc2os = 0;
+	uint32_t next_count = 0;
+	int ret;
+	u8 *data;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter(",,{pg=%p}", step->page);
+
+	jentry = step->transaction->jentry;
+
+	lock_page(step[1].page);
+
+	/* do all the allocation first */
+	ret = -ENOMEM;
+
+	BUG_ON(!step[1].pageio);
+	BUG_ON(!step[1].pageio->mapped_block);
+
+	cachefs_trans_affects_page(step->transaction,
+				   step[1].pageio,
+				   step->offset,
+				   sizeof(cachefs_blockix_t));
+
+	/* index content data blocks need to be initialised on disc */
+	if (step->flags & CACHEFS_BLOCK_INIT_INDEX) {
+		_debug("init index");
+
+		if (!(step[1].flags & CACHEFS_BLOCK_IS_INODE))
+			cachefs_trans_affects_inode(step->transaction, inode);
+
+		jentry->count = inode->index_esize;
+		jentry->ixentry = step->page->index * inode->index_epp;
+
+		metadata = cachefs_metadata_preread(inode);
+		jentry->index = metadata->freelink;
+		cachefs_metadata_postread(inode, metadata);
+	}
+
+	/* freshly allocated data blocks must be recorded in the v-journal */
+	if (step->flags & CACHEFS_BLOCK_INIT_NETFSDATA) {
+		_debug("init data");
+
+		ret = cachefs_vj_alloc(step->transaction, inode);
+		if (ret<0)
+			goto error_trans;
+
+		step->transaction->vjentry->pgnum = step->page->index;
+		step->transaction->vjentry->upblock = step[1].bix;
+		step->transaction->vjentry->upentry = step->offset;
+
+		jentry->auxmark = step->transaction->vjentry->vslot;
+	}
+
+	/* wait for a node to become available in the allocation stack */
+	down(&super->alloc_sem);
+
+	if (!super->alloc_node) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&super->alloc_wq, &myself);
+
+		while (!super->alloc_node && !signal_pending(current)) {
+			wake_up(&super->dmn_sleepq);
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&super->alloc_wq, &myself);
+
+		ret = -EINTR;
+		if (signal_pending(current))
+			goto error_sem;
+	}
+
+	_debug("use leaf %u/%lu",
+	       super->alloc_leaf, CACHEFS_ONDISC_LEAVES_PER_FREE_NODE);
+
+	BUG_ON(super->alloc_leaf > CACHEFS_ONDISC_LEAVES_PER_FREE_NODE);
+
+	step->transaction->changed |= CACHEFS_TRANS_CHANGED_ALLOC;
+
+	/* choose either a dependent block or the now empty node */
+	if (super->alloc_leaf == CACHEFS_ONDISC_LEAVES_PER_FREE_NODE) {
+		/* no dependent blocks left - take the alloc node itself */
+		block = super->alloc_block;
+		BUG_ON(!block);
+
+		jentry->block = super->alloc_cur;
+		BUG_ON(!jentry->block);
+
+		node = kmap_atomic(super->alloc_node, KM_USER0);
+		jentry->alloc_cur = node->next;
+		jentry->alloc_leaf = 0;
+		next_count = node->count;
+		kunmap_atomic(node, KM_USER0);
+
+		alloc2os = jentry->alloc_cur;
+
+		if (step->page)
+			cachefs_block_set(super,
+					  block,
+					  step->page,
+					  step->pageio);
+	}
+	else {
+		/* take the next dependent page */
+		node = kmap_atomic(super->alloc_node, KM_USER0);
+		jentry->block = node->leaves[super->alloc_leaf];
+		alloc2os = node->next;
+		kunmap_atomic(node, KM_USER0);
+		BUG_ON(!jentry->block);
+
+		jentry->alloc_cur = super->alloc_cur;
+		jentry->alloc_leaf = super->alloc_leaf + 1;
+
+		if (!step->page) {
+			ret = cachefs_block_read(super, NULL, jentry->block, 1,
+						 &block, &step->page);
+			if (ret < 0)
+				goto error_block;
+			step->pageio = cachefs_page_grab_private(step->page);
+		}
+		else {
+			ret = cachefs_block_set2(super, jentry->block,
+						 step->page, step->pageio,
+						 &block);
+			if (ret < 0)
+				goto error_block;
+		}
+	}
+
+	if (step->flags &
+	    (CACHEFS_BLOCK_INIT_INDEX | CACHEFS_BLOCK_INIT_INDIRECT))
+		cachefs_trans_affects_block(step->transaction, block, 0,
+					    PAGE_SIZE);
+
+	jentry->auxblock = inode->metadata->bix;
+	jentry->auxentry = inode->metadata_offset;
+	jentry->size = i_size_read(&inode->vfs_inode);
+
+	_debug("selected block %u", jentry->block);
+
+	BUG_ON(jentry->block > super->layout->bix_end);
+
+	/* start 2OS block loading if we're near the end of the TOS block
+	 * - failure to do so is only logged; it doesn't fail this allocation
+	 */
+	if (alloc2os &&
+	    super->alloc_leaf >= CACHEFS_ONDISC_LEAVES_PER_FREE_NODE - 30 &&
+	    !super->alloc_next
+	    ) {
+		_debug("prepare 2OS %u", alloc2os);
+
+		ret = cachefs_block_read(super, NULL, alloc2os, 0,
+					 &super->alloc_nxblock,
+					 &super->alloc_next);
+		if (ret == 0)
+			set_bit(CACHEFS_BLOCK_CRITICAL,
+				&super->alloc_nxblock->flags);
+		else
+			printk("CacheFS: can't read 2OS of alloc stack: %d\n",
+			       ret);
+	}
+
+	/* make sure the journal is marked on disc before doing anything else
+	 * - the result must be captured in ret: whatever ret held before this
+	 *   point (possibly 0) says nothing about whether the mark worked
+	 */
+	ret = cachefs_trans_mark(step->transaction);
+	if (ret < 0)
+		goto error_block;
+
+	if (step->flags & CACHEFS_BLOCK_INIT_NETFSDATA) {
+		set_bit(CACHEFS_BLOCK_NETFSDATA, &block->flags);
+		block->vjentry = step->transaction->vjentry;
+		block->vjentry->bix = block->bix;
+	}
+
+	/* index and indirection blocks need to be initialised before use */
+	if (step->flags & (CACHEFS_BLOCK_INIT_INDIRECT |
+			   CACHEFS_BLOCK_INIT_INDEX)
+	    ) {
+		cachefs_block_modify(super, block, &step->page);
+
+		if (step->flags & CACHEFS_BLOCK_INIT_INDIRECT) {
+			clear_highpage(step->page);
+		}
+		else {
+			struct cachefs_ondisc_index_entry *xent;
+			uint32_t entry, next;
+			void *content;
+			int loop;
+
+			next = jentry->index;
+			entry = jentry->ixentry;
+
+			content = kmap_atomic(step->page, KM_USER0);
+			clear_page(content);
+
+			/* chain the new entries together back to front so that
+			 * the first entry in the page ends up at the head and
+			 * the last one points at the old free list */
+			for (loop = inode->index_epp - 1; loop >= 0; loop--) {
+				xent = content + loop * jentry->count;
+				xent->state = CACHEFS_ONDISC_INDEX_FREE;
+				xent->u.freelink[0] = next;
+				next = entry + loop;
+			}
+
+			kunmap_atomic(content, KM_USER0);
+
+			_debug("new freelink: %u", jentry->ixentry);
+		}
+	}
+
+	/* clean up the alloc stack tracking */
+	if (super->alloc_leaf == 0) {
+		struct page *dead;
+
+		/* move the allocation stack to the 2OS */
+		dead = super->alloc_node;
+
+		super->alloc_cur_n = next_count;
+		super->alloc_node = super->alloc_next;
+		super->alloc_block = super->alloc_nxblock;
+		super->alloc_next = NULL;
+		super->alloc_nxblock = NULL;
+		dbgpgfree(dead);
+		page_cache_release(dead);
+	}
+
+	super->alloc_cur_n--;
+
+	up(&super->alloc_sem);
+
+	/* set the appropriate pointer on disc to point to this block */
+	step->bix = jentry->block;
+
+	if (!(step[1].flags & CACHEFS_BLOCK_IS_INODE)) {
+		cachefs_page_modify(super, &step[1].page);
+
+		data = kmap_atomic(step[1].page, KM_USER0);
+		*(cachefs_blockix_t *)(data + step->offset) = step->bix;
+		kunmap_atomic(data, KM_USER0);
+	}
+
+	metadata = cachefs_metadata_prewrite(inode);
+	metadata->size = i_size_read(&inode->vfs_inode);
+	metadata->mtime = CURRENT_TIME.tv_sec;
+
+	if (step->flags & CACHEFS_BLOCK_INIT_INDEX) {
+		metadata->freelink = jentry->ixentry;
+	}
+
+	if (step[1].flags & CACHEFS_BLOCK_IS_INODE) {
+		/* the pointer lives in the inode's own metadata page, which is
+		 * already mapped for writing, so poke it through that mapping:
+		 * step->offset is relative to the start of the page */
+		unsigned long pageaddr = (unsigned long) metadata & PAGE_MASK;
+		*(cachefs_blockix_t *)(pageaddr + step->offset) = step->bix;
+	}
+
+	cachefs_metadata_postwrite(inode, metadata);
+
+	unlock_page(step[1].page);
+
+	/* okay... done that */
+	cachefs_trans_commit(step->transaction);
+	step->transaction = NULL;
+
+	/* the allocation must be journalled before journalling-independent
+	 * writes are permitted to modify a reused metadata block that had
+	 * critical data on it
+	 */
+	if ((step->flags & CACHEFS_BLOCK_INIT_NETFSDATA) &&
+	    test_bit(CACHEFS_BLOCK_CRITICAL, &block->flags)
+	    ) {
+		cachefs_trans_sync(super, CACHEFS_TRANS_SYNC_WAIT_FOR_MARK);
+		clear_bit(CACHEFS_BLOCK_CRITICAL, &block->flags);
+	}
+
+	cachefs_block_put(block);
+	block = NULL;
+
+	_leave(" = 0 [block %u]", step->bix);
+	return 0;
+
+	/* the error labels unwind in reverse order of acquisition: the block
+	 * ref, then the allocator semaphore, then the transaction and the
+	 * lock on the parent page */
+ error_block:
+	cachefs_block_put(block);
+	block = NULL;
+ error_sem:
+	up(&super->alloc_sem);
+ error_trans:
+	cachefs_trans_put(step->transaction);
+	step->transaction = NULL;
+	unlock_page(step[1].page);
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_indr_io_get_block_alloc() */
+
+/*****************************************************************************/
+/*
+ * map a block in a file to a block within the block device
+ * - the inode meta-data contains:
+ *   - ~120 direct pointers for the first part of the file
+ *   - 1 single-indirect pointer for the first indirection block (1024 ptrs)
+ *   - 1 double-indirect pointer for the remainder of the file
+ *   and must be included in the final journal mark
+ * - returns:
+ *   - 0 if successful and the block details are set in result
+ *   - -ENODATA if no block at that index
+ * - sets CACHEFS_PAGE_BOUNDARY if the next block has a different indirection
+ *   chain
+ * - if the inode forms part of an index, then any blocks belong to that
+ *   index and must be initialised as part of the final journalling mark
+ */
+int cachefs_indr_io_get_block(struct inode *vfs_inode, struct page *page,
+			      struct cachefs_page *pageio, int create)
+{
+	struct cachefs_io_block_path path[4];
+	struct cachefs_inode *inode = CACHEFS_FS_I(vfs_inode);
+	struct cachefs_super *super = inode->vfs_inode.i_sb->s_fs_info;
+	const size_t ptrperblk = PAGE_SIZE /
sizeof(cachefs_blockix_t); + sector_t iblock; + size_t ptrqty, notboundary = 1; + int pix, ret; + + _enter("%lu,{%p}%lu,,%d", + inode->vfs_inode.i_ino, page, page->index, create); + + BUG_ON(pageio->mapped_block); + + if (page->index / ptrperblk >= ptrperblk) { + _leave(" = -EIO [range]"); + return -EIO; + } + + memset(path, 0, sizeof(path)); + path[2].mktype = CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING; + path[1].mktype = CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING; + path[0].mktype = CACHEFS_ONDISC_UJNL_DATA_ALLOCING; + path[0].flags = CACHEFS_BLOCK_INIT_NETFSDATA; + + if (inode->index_esize) { + path[0].mktype = CACHEFS_ONDISC_UJNL_INDEX_EXTENDING; + path[0].flags = CACHEFS_BLOCK_INIT_INDEX; + } + + path[0].page = page; + path[0].pageio = pageio; + + /* is it inside direct range? */ + iblock = page->index; + ptrqty = super->sb->s_blocksize; + ptrqty -= sizeof(struct cachefs_ondisc_metadata); + ptrqty /= sizeof(cachefs_blockix_t); + if (iblock < ptrqty) { + _debug("direct (%llu/%u)", iblock, ptrqty); + notboundary = ptrqty - iblock + 1; + + path[0].offset = iblock * sizeof(cachefs_blockix_t); + path[0].offset += offsetof(struct cachefs_ondisc_metadata, + direct); + path[1].flags = CACHEFS_BLOCK_IS_INODE; + path[1].page = inode->metadata_page; + pix = 0; + goto process; + } + iblock -= ptrqty; + + /* is it inside single-indirect range? */ + ptrqty = ptrperblk; + if (iblock < ptrqty) { + _debug("indirect (%llu/%u)", iblock, ptrqty); + notboundary = (iblock + 1) & (ptrperblk - 1); + + path[0].offset = iblock * sizeof(cachefs_blockix_t); + path[1].flags = CACHEFS_BLOCK_INIT_INDIRECT; + path[1].offset = offsetof(struct cachefs_ondisc_metadata, + single_indirect); + path[1].auxmark = CACHEFS_ONDISC_UJNL_SINGLE_0; + path[2].flags = CACHEFS_BLOCK_IS_INODE; + path[2].page = inode->metadata_page; + pix = 1; + goto process; + } + iblock -= ptrqty; + + /* is it inside double-indirect range? 
*/ + ptrqty *= ptrqty; + if (iblock < ptrqty) { + _debug("double indirect (%llu/%u)", iblock, ptrqty); + notboundary = (iblock + 1) & (ptrperblk - 1); + + path[0].offset = + sector_div(iblock, + PAGE_SIZE / sizeof(cachefs_blockix_t)); + path[0].offset *= sizeof(cachefs_blockix_t); + path[1].flags = CACHEFS_BLOCK_INIT_INDIRECT; + path[1].offset = iblock * sizeof(cachefs_blockix_t); + path[1].auxmark = CACHEFS_ONDISC_UJNL_DOUBLE_1; + path[2].flags = CACHEFS_BLOCK_INIT_INDIRECT; + path[2].offset = offsetof(struct cachefs_ondisc_metadata, + double_indirect); + path[2].auxmark = CACHEFS_ONDISC_UJNL_DOUBLE_0; + path[3].flags = CACHEFS_BLOCK_IS_INODE; + path[3].page = inode->metadata_page; + pix = 2; + goto process; + } + + /* it seems to be inside triple-indirect range, which isn't supported + * yet (TODO) */ + BUG(); + pix = 3; + + /* walk the path, filling in missing steps if required */ + process: + dbgpgalloc(path[pix + 1].page); + page_cache_get(path[pix + 1].page); + + path[pix].offset += inode->metadata_offset; + + down_read(&inode->metadata_sem); + path[pix + 1].pageio = cachefs_page_grab_private(inode->metadata_page); + up_read(&inode->metadata_sem); + + path[pix + 1].bix = path[pix + 1].pageio->mapped_block->bix; + + ret = 0; + for (; pix >= 0; pix--) { + struct cachefs_io_block_path *step = &path[pix]; + + _debug("step level %u { ptr={%lu}+%u / bix=%u }", + pix, step[1].page->index, step->offset, step[1].bix); + + /* get the block number for this level */ + if (!step->bix) { + u8 *data = kmap_atomic(step[1].page, KM_USER0); + step->bix = + *(cachefs_blockix_t *)(data + step->offset); + kunmap_atomic(data, KM_USER0); + } + + /* allocate this block if necessary */ + if (!step->bix) { + struct cachefs_ondisc_update_journal *jentry; + + if (!create) { + _debug("path incomplete at level %d", pix); + ret = -ENODATA; + break; + } + + _debug("need to allocate level %d block", pix); + + step->transaction = + cachefs_trans_alloc( + inode->vfs_inode.i_sb->s_fs_info, + 
GFP_NOFS); + + ret = -ENOMEM; + if (!step->transaction) + break; + + jentry = step->transaction->jentry; + + jentry->ino = inode->vfs_inode.i_ino; + jentry->pgnum = page->index; + jentry->mark = step->mktype; + jentry->auxmark = step->auxmark; + jentry->upblock = + __cachefs_get_page_block(step[1].page)->bix; + jentry->upentry = step->offset; + + ret = cachefs_indr_io_get_block_alloc( + inode->vfs_inode.i_sb, inode, step); + if (ret < 0) + break; + step->flags |= CACHEFS_BLOCK_NEW; + } + else if (step->page) { + ret = cachefs_block_set2(super, step->bix, step->page, + step->pageio, NULL); + if (ret < 0) + break; + } + + /* if we're at the leaf, we don't need to actually access the + * block */ + if (pix <= 0) + continue; + + /* initiate or read the this block as appropriate */ + if (!step->page) { + if (step->flags & CACHEFS_BLOCK_NEW) { + _debug("getting level %d block %u", + pix, step->bix); + } + else { + _debug("reading level %d block %u", + pix, step->bix); + } + + ret = cachefs_block_read( + super, NULL, step->bix, + step->flags & CACHEFS_BLOCK_NEW, + NULL, &step->page); + if (ret < 0) { + printk("CacheFS: " + "read I/O error on level %d block %u:" + " %d\n", + pix, step->bix, ret); + break; + } + + wait_on_page_locked(step->page); + } + + if (!step->pageio) { + step->pageio = __cachefs_page_grab_private(step->page); + if (!step->pageio) { + printk("step level %u" + " { ptr={%lu}+%u / bix=%u }", + pix, step[1].page->index, + step->offset, step[1].bix); + printk("mk=%u aux=%u flags=%x", + step->mktype, + step->auxmark, + step->flags); + BUG(); + } + } + } + + /* release the pages used to walk the path */ + for (pix = sizeof(path) / sizeof(path[0]) - 1; pix > 0; pix--) + if (path[pix].page) { + dbgpgfree(path[pix].page); + page_cache_release(path[pix].page); + } + + if (ret < 0) { + cachefs_block_put(xchg(&pageio->mapped_block, NULL)); + _leave(" = %d", ret); + return ret; + } + else if (path[0].flags & CACHEFS_BLOCK_INIT_NETFSDATA) { + 
set_bit(CACHEFS_BLOCK_NETFSDATA, &pageio->mapped_block->flags); + } + + /* got the block - set the block offset in the page mapping record */ + if (path[0].flags & CACHEFS_BLOCK_NEW) + set_bit(CACHEFS_PAGE_NEW, &pageio->flags); + + _debug("notboundary = %u", notboundary); + if (!notboundary) + set_bit(CACHEFS_PAGE_BOUNDARY, &pageio->flags); + + _leave(" = 0 [bix=%u %c%c]", + pageio->mapped_block->bix, + test_bit(CACHEFS_PAGE_BOUNDARY, &pageio->flags) ? 'b' : '-', + test_bit(CACHEFS_PAGE_NEW, &pageio->flags) ? 'n' : '-' + ); + return 0; + +} /* end cachefs_indr_io_get_block() */ diff -puN /dev/null fs/cachefs/inode.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/inode.c 2005-06-26 13:42:55.000000000 -0700 @@ -0,0 +1,400 @@ +/* cache-inode.c: general cache filesystem inode handling code + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cachefs-int.h"
+
+static int cachefs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat);
+
+static struct inode_operations cachefs_inode_operations = {
+	.getattr	= cachefs_inode_getattr,
+};
+
+static struct file_operations cachefs_file_operations = {
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+};
+
+/*****************************************************************************/
+/*
+ * set up a status file virtual inode
+ * - a zero-length, read-only regular file owned by root that uses the status
+ *   file inode and file operations
+ */
+static void cachefs_iget_status_file(struct cachefs_inode *inode)
+{
+	inode->vfs_inode.i_mode = S_IFREG | S_IRUGO;
+	inode->vfs_inode.i_uid = 0;
+	inode->vfs_inode.i_gid = 0;
+	inode->vfs_inode.i_nlink = 1;
+	inode->vfs_inode.i_size = 0;
+	inode->vfs_inode.i_atime = CURRENT_TIME;
+	inode->vfs_inode.i_mtime = CURRENT_TIME;
+	inode->vfs_inode.i_ctime = CURRENT_TIME;
+	inode->vfs_inode.i_blksize = PAGE_SIZE;
+	inode->vfs_inode.i_blkbits = PAGE_SHIFT;
+	inode->vfs_inode.i_blocks = 0;
+	inode->vfs_inode.i_version = 1;
+	inode->vfs_inode.i_flags |= S_NOATIME;
+	inode->vfs_inode.i_op = &cachefs_status_inode_operations;
+	inode->vfs_inode.i_fop = &cachefs_status_file_operations;
+
+} /* end cachefs_iget_status_file() */
+
+/*****************************************************************************/
+/*
+ * set up a linear file inode (such as the inode we use to represent the entire
+ * block device)
+ * - blocks is the length of the file in pages
+ * - the file is read-only, owned by root and uses the linear I/O address
+ *   space operations
+ */
+static void cachefs_iget_linear_file(struct cachefs_inode *inode,
+				     unsigned blocks)
+{
+	inode->vfs_inode.i_mode = S_IFREG | S_IRUGO;
+	inode->vfs_inode.i_uid = 0;
+	inode->vfs_inode.i_gid = 0;
+	inode->vfs_inode.i_nlink = 1;
+
+	/* widen to loff_t before shifting: unsigned long is only 32 bits on
+	 * some arches, which would truncate the size of anything of 4GB or
+	 * more */
+	inode->vfs_inode.i_size = (loff_t) blocks << PAGE_SHIFT;
+	inode->vfs_inode.i_atime = CURRENT_TIME;
+	inode->vfs_inode.i_mtime = CURRENT_TIME;
+	inode->vfs_inode.i_ctime = CURRENT_TIME;
+	inode->vfs_inode.i_blksize = PAGE_SIZE;
+	inode->vfs_inode.i_blkbits = PAGE_SHIFT;
+	inode->vfs_inode.i_blocks = blocks;
+	inode->vfs_inode.i_version = 1;
+	inode->vfs_inode.i_flags |= S_NOATIME;
+	inode->vfs_inode.i_op = &cachefs_inode_operations;
+	inode->vfs_inode.i_fop = &cachefs_file_operations;
+	inode->vfs_inode.i_mapping->a_ops =
+		&cachefs_linear_io_addrspace_operations;
+
+} /* end cachefs_iget_linear_file() */
+
+/*****************************************************************************/
+/*
+ * retrieve the inode for the meta-data file, the first block of which we know
+ * to reside in block 1
+ */
+static int cachefs_iget_file0(struct cachefs_inode *inode)
+{
+	struct cachefs_ondisc_metadata *metadata;
+	struct cachefs_super *super;
+	struct page *metadata_page;
+	unsigned pos;
+
+	super = inode->vfs_inode.i_sb->s_fs_info;
+
+	_enter("{sb=%p ino=%lu}",
+	       inode->vfs_inode.i_sb, inode->vfs_inode.i_ino);
+
+	/* stick in some initial values so that we can read the first page into
+	 * the page cache */
+	inode->vfs_inode.i_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	inode->vfs_inode.i_uid = 0;
+	inode->vfs_inode.i_gid = 0;
+	inode->vfs_inode.i_nlink = 1;
+	inode->vfs_inode.i_size = PAGE_SIZE;
+	inode->vfs_inode.i_blksize = PAGE_SIZE;
+	inode->vfs_inode.i_blkbits = PAGE_SHIFT;
+	inode->vfs_inode.i_blocks = 1;
+	inode->vfs_inode.i_version = 0;
+	inode->vfs_inode.i_flags |= S_NOATIME;
+	inode->vfs_inode.i_op = &cachefs_inode_operations;
+	inode->vfs_inode.i_fop = &cachefs_file_operations;
+
+	inode->vfs_inode.i_mapping->a_ops =
+		&cachefs_indr_io_addrspace_operations;
+
+	inode->index_dsize = super->layout->metadata_size;
+	inode->index_esize = inode->index_dsize;
+	inode->index_epp = PAGE_SIZE / inode->index_esize;
+
+	__set_bit(CACHEFS_ACTIVE_INODE_ISINDEX, &inode->flags);
+
+	/* read the block containing this inode's meta-data from disc */
+	pos = inode->vfs_inode.i_ino << super->layout->metadata_bits;
+
+	metadata_page = cachefs_get_page(inode, pos / PAGE_SIZE);
+	if (IS_ERR(metadata_page)) {
+		/* report the failure under this filesystem's own name, as
+		 * cachefs_iget_fileN() does */
+		printk("CacheFS: Failed to read meta-data page %lu: %ld\n",
+		       pos / PAGE_SIZE, PTR_ERR(metadata_page));
+		_leave(" = %ld", PTR_ERR(metadata_page));
+		return PTR_ERR(metadata_page);
+	}
+
+	inode->metadata_page = metadata_page;
+
+	/* finish initialising the inode from its own contents */
+	inode->metadata = __cachefs_get_page_block(inode->metadata_page);
+
+	metadata = cachefs_metadata_preread(inode);
+
+	_debug("read page %lu (pos %04x-%04x)",
+	       inode->metadata_page->index, pos, pos + inode->index_esize - 1);
+
+	/* the block count is the size rounded up to whole pages */
+	inode->vfs_inode.i_size = metadata->size;
+	inode->vfs_inode.i_blocks = metadata->size + inode->vfs_inode.i_blksize - 1;
+	inode->vfs_inode.i_blocks >>= PAGE_SHIFT;
+	inode->vfs_inode.i_version = 1;
+	inode->vfs_inode.i_atime.tv_sec = metadata->mtime;
+	inode->vfs_inode.i_mtime.tv_sec = metadata->mtime;
+	inode->vfs_inode.i_ctime.tv_sec = metadata->mtime;
+
+	/* replace the provisional index geometry with what's recorded on disc
+	 * NOTE(review): index.esize comes straight from disc and is used as a
+	 * divisor here without being checked for zero - confirm that it's
+	 * validated elsewhere
+	 */
+	inode->index_dsize = metadata->index.dsize;
+	inode->index_esize = metadata->index.esize;
+	inode->index_epp = PAGE_SIZE / metadata->index.esize;
+
+	cachefs_metadata_postread(inode, metadata);
+
+	inode->vfs_inode.i_atime.tv_nsec = 0;
+	inode->vfs_inode.i_mtime.tv_nsec = 0;
+	inode->vfs_inode.i_ctime.tv_nsec = 0;
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_iget_file0() */
+
+/*****************************************************************************/
+/*
+ * retrieve the inode attributes for the Nth file from disc
+ * - this resides in the metadata inode
+ */
+static int cachefs_iget_fileN(struct cachefs_inode *inode)
+{
+	struct cachefs_ondisc_metadata *metadata;
+	struct cachefs_super *super;
+	struct cachefs_inode *imetadata;
+	struct page *metadata_page;
+	unsigned pos;
+
+	super = inode->vfs_inode.i_sb->s_fs_info;
+
+	_enter("{s=%p ino=%lu}", super, inode->vfs_inode.i_ino);
+
+	/* get the meta-file inode */
+	imetadata = cachefs_igrab(super->imetadata);
+	if (!imetadata) {
+		_leave(" = -EIO");
+		return -EIO;
+	}
+
+	/* read the page containing this inode's meta-data */
+	pos =
inode->vfs_inode.i_ino * imetadata->index_esize;
+	metadata_page = cachefs_get_page(imetadata, pos / PAGE_SIZE);
+	cachefs_iput(imetadata);
+
+	if (IS_ERR(metadata_page)) {
+		printk("CacheFS: Failed to read meta-data page %lu: %ld\n",
+		       pos / PAGE_SIZE, PTR_ERR(metadata_page));
+		_leave(" = %ld", PTR_ERR(metadata_page));
+		return PTR_ERR(metadata_page);
+	}
+
+	inode->metadata_page = metadata_page;
+
+	/* initialise the inode from the data we read */
+	inode->metadata = __cachefs_get_page_block(inode->metadata_page);
+
+	_debug("Reading inode %lu metadata record {%lu,{%u}}+%04x",
+	       inode->vfs_inode.i_ino,
+	       inode->metadata_page->index,
+	       inode->metadata->bix,
+	       pos);
+
+	inode->vfs_inode.i_atime.tv_nsec = 0;
+	inode->vfs_inode.i_mtime.tv_nsec = 0;
+	inode->vfs_inode.i_ctime.tv_nsec = 0;
+
+	metadata = cachefs_metadata_preread(inode);
+
+	/* assume a plain data file to begin with; this is overridden below if
+	 * the inode turns out to be an index */
+	inode->vfs_inode.i_mode = S_IFREG | S_IRUGO;
+	inode->vfs_inode.i_uid = 0;
+	inode->vfs_inode.i_gid = 0;
+	inode->vfs_inode.i_nlink = 1;
+	inode->vfs_inode.i_size = metadata->size;
+	inode->vfs_inode.i_atime.tv_sec = metadata->mtime;
+	inode->vfs_inode.i_mtime.tv_sec = metadata->mtime;
+	inode->vfs_inode.i_ctime.tv_sec = metadata->mtime;
+	inode->vfs_inode.i_blksize = PAGE_SIZE;
+	inode->vfs_inode.i_blkbits = PAGE_SHIFT;
+
+	/* the block count is the size rounded up to whole pages */
+	inode->vfs_inode.i_blocks = metadata->size;
+	inode->vfs_inode.i_blocks += inode->vfs_inode.i_blksize - 1;
+	inode->vfs_inode.i_blocks >>= PAGE_SHIFT;
+	inode->vfs_inode.i_version = 1;
+	inode->vfs_inode.i_flags |= S_NOATIME;
+	inode->vfs_inode.i_op = &cachefs_inode_operations;
+	inode->vfs_inode.i_fop = &cachefs_file_operations;
+
+	inode->vfs_inode.i_mapping->a_ops =
+		&cachefs_indr_io_addrspace_operations;
+
+	inode->index_dsize = metadata->index.dsize;
+	inode->index_esize = metadata->index.esize;
+
+	cachefs_metadata_postread(inode, metadata);
+
+	/* keep a copy of an index's definition too */
+	inode->index_epp = 0;
+
+	if (inode->index_esize ||
+	    inode->vfs_inode.i_ino == CACHEFS_INO_ROOTDIR
+	    ) {
+		/* the root directory is treated as an index even if its
+		 * record gives no entry size, so the entry size may be zero
+		 * here - leave index_epp at zero in that case rather than
+		 * dividing by it */
+		if (inode->index_esize)
+			inode->index_epp = PAGE_SIZE / inode->index_esize;
+		inode->vfs_inode.i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+		inode->vfs_inode.i_nlink = 2;
+		inode->vfs_inode.i_op = &cachefs_root_inode_operations;
+		inode->vfs_inode.i_fop = &cachefs_root_file_operations;
+
+		__set_bit(CACHEFS_ACTIVE_INODE_ISINDEX, &inode->flags);
+	}
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_iget_fileN() */
+
+/*****************************************************************************/
+/*
+ * attempt to retrieve the inode for a cached file
+ */
+struct cachefs_inode *cachefs_iget(struct cachefs_super *super, ino_t ino)
+{
+	struct cachefs_inode *inode;
+	struct inode *vfs_inode;
+	unsigned tmp;
+	loff_t nblocks;
+	int ret;
+
+	_enter(",%lu,", ino);
+
+	BUG_ON(ino == 0);
+
+	/* it does reside in this cache - create an inode for it */
+	vfs_inode = iget_locked(super->sb, ino);
+	if (!vfs_inode) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode = CACHEFS_FS_I(vfs_inode);
+
+	/* deal with an existing inode */
+	if (!(inode->vfs_inode.i_state & I_NEW)) {
+		_leave(" = 0 [exist]");
+		return inode;
+	}
+
+	/* new inode - attempt to find in the on-disc catalogue */
+	switch (ino) {
+		/* they've asked for the virtual inode that mirrors the
+		 * underlying block device */
+	case CACHEFS_INO_MISC:
+		nblocks = i_size_read(super->sb->s_bdev->bd_inode);
+		do_div(nblocks, PAGE_SIZE);
+		if (nblocks > UINT_MAX)
+			nblocks = UINT_MAX;
+		cachefs_iget_linear_file(inode, nblocks);
+		break;
+
+		/* they've asked for writeback journal virtual inode */
+	case CACHEFS_INO_WBJOURNAL:
+		tmp = super->layout->bix_cache - super->layout->bix_wbjournal;
+		cachefs_iget_linear_file(inode, tmp);
+		break;
+
+		/* they've asked for the status file virtual inode */
+	case CACHEFS_INO_STATUS:
+		cachefs_iget_status_file(inode);
+		break;
+
+		/* they've asked for the meta-data inode */
+	case CACHEFS_INO_METADATA:
+		inode->metadata_offset =
+			(ino << super->layout->metadata_bits) & ~PAGE_MASK;
+		ret =
cachefs_iget_file0(inode);
+		if (ret < 0)
+			goto bad_inode;
+		break;
+
+		/* they've asked for an index or a data file cache inode
+		 * - the position of its record within the metadata page is
+		 *   noted before the record is read
+		 */
+	default:
+		inode->metadata_offset =
+			(ino << super->layout->metadata_bits) & ~PAGE_MASK;
+		ret = cachefs_iget_fileN(inode);
+		if (ret < 0)
+			goto bad_inode;
+		break;
+	}
+
+	/* success - the inode is fully set up, so unlock it */
+	unlock_new_inode(&inode->vfs_inode);
+
+	_leave(" = %p", inode);
+	return inode;
+
+	/* failure - mark the partly set up inode as bad, unlock it and drop
+	 * the ref we got from iget_locked() before handing the error back as
+	 * an ERR_PTR() */
+ bad_inode:
+	make_bad_inode(&inode->vfs_inode);
+	unlock_new_inode(&inode->vfs_inode);
+	iput(&inode->vfs_inode);
+
+	_leave(" = %d [bad]", ret);
+	return ERR_PTR(ret);
+
+} /* end cachefs_iget() */
+
+/*****************************************************************************/
+/*
+ * write a cache inode back to disc
+ * - don't use generic_file_write() to write out the meta-data file's meta-data
+ *   as it updates the mtime & ctime and marks the inode dirty again
+ * - at present this only traces the call and returns 0; nothing is written
+ *   from here
+ */
+int cachefs_write_inode(struct inode *vfs_inode, int sync)
+{
+	_enter("{sb=%p ino=%lu},%d", vfs_inode->i_sb, vfs_inode->i_ino, sync);
+	return 0;
+
+} /* end cachefs_write_inode() */
+
+/*****************************************************************************/
+/*
+ * clear an inode
+ * - at present this only traces the call
+ */
+void cachefs_clear_inode(struct inode *vfs_inode)
+{
+	_enter("{ino=%lu nl=%u}", vfs_inode->i_ino, vfs_inode->i_nlink);
+
+} /* end cachefs_clear_inode() */
+
+/*****************************************************************************/
+/*
+ * read the attributes of an inode
+ * - fills in *stat from the dentry's VFS inode using generic_fillattr()
+ * - always returns 0
+ */
+int cachefs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			  struct kstat *stat)
+{
+	_enter("{ ino=%lu }", dentry->d_inode->i_ino);
+
+	generic_fillattr(dentry->d_inode, stat);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_inode_getattr() */
diff -puN /dev/null fs/cachefs/interface.c
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ 25-akpm/fs/cachefs/interface.c	2005-06-26 13:42:55.000000000 -0700
@@ -0,0 +1,1473 @@
+/* interface.c: network FS interface to cache
+ *
+ * Copyright (C)
2003 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include "cachefs-int.h"
+
+struct cachefs_io_end {	/* NOTE(review): presumably per-I/O completion state - confirm against its users */
+	cachefs_rw_complete_t	func;		/* presumably called when the I/O completes - confirm */
+	void			*data;		/* opaque pointer; presumably handed to func - confirm */
+	void			*cookie_data;	/* opaque pointer; presumably the netfs's cookie data - confirm */
+	struct cachefs_block	*block;		/* block this record refers to */
+};
+
+LIST_HEAD(cachefs_netfs_list);		/* NOTE(review): presumably the registered netfs's - confirm */
+LIST_HEAD(cachefs_cache_list);		/* NOTE(review): presumably the available caches - confirm */
+DECLARE_RWSEM(cachefs_addremove_sem);	/* NOTE(review): presumably guards the two lists above - confirm */