head 1.35; access; symbols; locks; strict; comment @ * @; 1.35 date 92.06.30.22.31.43; author mao; state Exp; branches; next 1.34; 1.34 date 92.06.11.17.49.25; author mao; state Exp; branches; next 1.33; 1.33 date 92.05.28.17.09.35; author mao; state Exp; branches; next 1.32; 1.32 date 92.01.29.21.32.50; author mao; state Exp; branches; next 1.31; 1.31 date 91.11.14.19.40.44; author kemnitz; state Exp; branches; next 1.30; 1.30 date 91.11.08.20.18.35; author mao; state Exp; branches; next 1.29; 1.29 date 91.11.07.06.05.53; author mao; state Exp; branches; next 1.28; 1.28 date 91.10.29.06.34.27; author mao; state Exp; branches; next 1.27; 1.27 date 91.10.29.06.33.22; author mao; state Exp; branches; next 1.26; 1.26 date 91.10.29.04.12.35; author mao; state Exp; branches; next 1.25; 1.25 date 91.10.29.00.11.52; author mao; state Exp; branches; next 1.24; 1.24 date 91.10.04.17.52.59; author mao; state Exp; branches; next 1.23; 1.23 date 91.10.03.15.07.32; author mao; state Exp; branches; next 1.22; 1.22 date 91.10.03.00.56.55; author mao; state Exp; branches; next 1.21; 1.21 date 91.09.28.20.04.03; author mao; state Exp; branches; next 1.20; 1.20 date 91.09.11.07.19.37; author mao; state Exp; branches; next 1.19; 1.19 date 91.09.10.23.27.19; author mao; state Exp; branches; next 1.18; 1.18 date 91.09.10.06.41.50; author mao; state Exp; branches; next 1.17; 1.17 date 91.09.09.23.58.55; author mao; state Exp; branches; next 1.16; 1.16 date 91.09.05.23.26.02; author hong; state Exp; branches; next 1.15; 1.15 date 91.08.22.06.33.09; author mao; state Exp; branches; next 1.14; 1.14 date 91.08.13.22.00.30; author mao; state Exp; branches; next 1.13; 1.13 date 91.08.08.05.53.28; author mao; state Exp; branches; next 1.12; 1.12 date 91.08.06.08.09.21; author mao; state Exp; branches; next 1.11; 1.11 date 91.08.06.01.41.44; author mao; state Exp; branches; next 1.10; 1.10 date 91.08.03.00.29.18; author mao; state Exp; branches; next 1.9; 1.9 date 91.07.29.16.52.28; author mer; 
state Exp; branches; next 1.8; 1.8 date 91.07.26.00.52.21; author mao; state Exp; branches; next 1.7; 1.7 date 91.07.24.23.37.03; author mao; state Exp; branches; next 1.6; 1.6 date 91.07.24.07.47.24; author mao; state Exp; branches; next 1.5; 1.5 date 91.07.22.22.21.11; author mao; state Exp; branches; next 1.4; 1.4 date 91.07.22.08.00.36; author mao; state Exp; branches; next 1.3; 1.3 date 91.07.22.05.32.38; author mao; state Exp; branches; next 1.2; 1.2 date 91.07.21.23.13.32; author mao; state Exp; branches; next 1.1; 1.1 date 91.07.09.00.12.09; author mao; state Exp; branches; next ; desc @sony jukebox storage manager @ 1.35 log @initialize those variables before you go passing them around. @ text @/* * sj.c -- sony jukebox storage manager. * * This code manages relations that reside on the sony write-once * optical disk jukebox. */ #include "tmp/c.h" #include "tmp/postgres.h" #ifdef SONY_JUKEBOX #include #include #include "machine.h" #include "tmp/miscadmin.h" #include "storage/ipc.h" #include "storage/ipci.h" #include "storage/smgr.h" #include "storage/shmem.h" #include "storage/spin.h" #include "utils/hsearch.h" #include "utils/rel.h" #include "utils/log.h" #include "access/htup.h" #include "access/relscan.h" #include "access/heapam.h" #include "catalog/pg_platter.h" #include "catalog/pg_plmap.h" #include "catalog/pg_proc.h" #include "storage/sj.h" RcsId("$Header: /private/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.34 1992/06/11 17:49:25 mao Exp mao $"); /* globals used in this file */ SPINLOCK SJCacheLock; /* lock for cache metadata */ extern ObjectId MyDatabaseId; /* OID of database we have open */ extern Name MyDatabaseName; /* name of database we have open */ static File SJCacheVfd; /* vfd for cache data file */ static File SJMetaVfd; /* vfd for cache metadata file */ static File SJBlockVfd; /* vfd for nblocks file */ static SJCacheHeader *SJHeader; /* pointer to cache header in shmem */ static HTAB *SJCacheHT; /* pointer to hash table in shmem */ 
static SJCacheItem *SJCache;        /* pointer to cache metadata in shmem;
                                     * indexed by group number */
static SJCacheTag *SJNBlockCache;   /* pointer to nblock cache */

#ifndef HAS_TEST_AND_SET

/*
 *  If we don't have test-and-set locks, then we need a semaphore for
 *  concurrency control.  This semaphore is in addition to the metadata
 *  lock, SJCacheLock, that we acquire before touching the cache metadata.
 *
 *  This semaphore is used in two ways.  During cache initialization, we
 *  use it to lock out all other backends that want cache access.  During
 *  normal processing, we control access to groups on which IO is in
 *  progress by holding this lock.  When we're done with initialization or
 *  IO, we do enough V's on the semaphore to satisfy all outstanding P's.
 */

static IpcSemaphoreId SJWaitSemId;  /* wait semaphore */
static long *SJNWaiting;            /* # procs sleeping on the wait sem;
                                     * sjinit() points this into the
                                     * shared metadata block */

#endif /* ndef HAS_TEST_AND_SET */

/*
 *  static buffer is for data transfer.  It is an ordinary file-scope
 *  array (not part of the shared-memory metadata block) of SJBUFSIZE
 *  bytes.
 */
static char SJCacheBuf[SJBUFSIZE];

/*
 *  When we have to do IO on a group, we avoid holding an exclusive lock on
 *  the cache metadata for the duration of the operation.  We do this by
 *  setting a finer-granularity lock on the group itself.  How we do this
 *  depends on whether we have test-and-set locks or not.  If so, it's
 *  easy; we set the TASlock on the item itself.  Otherwise, we use the
 *  'wait' semaphore described above.
*/ #ifdef HAS_TEST_AND_SET #define SET_IO_LOCK(item) \ item->sjc_gflags |= SJC_IOINPROG; \ SpinRelease(SJCacheLock); \ S_LOCK(&(item->sjc_iolock)); #else /* HAS_TEST_AND_SET */ #define SET_IO_LOCK(item) \ item->sjc_gflags |= SJC_IOINPROG; \ (*SJNWaiting)++; \ SpinRelease(SJCacheLock); \ IpcSemaphoreLock(SJWaitSemId, 0, 1); #endif /* HAS_TEST_AND_SET */ #define GROUPNO(item) (((char *) item) - ((char *) &(SJCache[0])))/sizeof(SJCacheItem) /* routines declared in this file */ static void _sjcacheinit(); static void _sjwait_init(); static void _sjunwait_init(); static void _sjwait_io(); static void _sjunwait_io(); static void _sjtouch(); static void _sjunpin(); static void _sjregister(); static void _sjregnblocks(); static void _sjnewextent(); static void _sjrdextent(); static void _sjdirtylast(); static int _sjfindnblocks(); static int _sjwritegrp(); static int _sjreadgrp(); static int _sjgroupvrfy(); static Form_pg_plmap _sjchoose(); static SJCacheItem *_sjallocgrp(); static SJCacheItem *_sjfetchgrp(); static SJHashEntry *_sjhashop(); static int _sjgetgrp(); static void _sjdump(); /* routines declared elsewhere */ extern HTAB *ShmemInitHash(); extern int *ShmemInitStruct(); extern Relation RelationIdGetRelation(); extern BlockNumber pgjb_offset(); extern bool pgjb_freespc(); /* * sjinit() -- initialize the Sony jukebox storage manager. * * We need to find (or establish) the mag-disk buffer cache metadata * in shared memory and open the cache on mag disk. The first backend * to run that touches the cache initializes it. All other backends * running simultaneously will only wait for this initialization to * complete if they need to get data out of the cache. Otherwise, * they'll return successfully immediately after attaching the cache * memory, and will let their older sibling do all the work. 
*/ int sjinit() { unsigned int metasize; bool metafound; HASHCTL info; bool initcache; char *cacheblk, *cachesave; int status; char *pghome; char path[SJPATHLEN]; /* * First attach the shared memory block that contains the disk * cache metadata. At the end of this block in shared memory is * the hash table we use to do fast lookup on groups in the cache. */ SpinAcquire(SJCacheLock); #ifdef HAS_TEST_AND_SET metasize = (SJCACHESIZE * sizeof(SJCacheItem)) + sizeof(SJCacheHeader) + (SJNBLKSIZE * sizeof(SJCacheTag)); #else /* HAS_TEST_AND_SET */ metasize = (SJCACHESIZE * sizeof(SJCacheItem)) + sizeof(SJCacheHeader) + (SJNBLKSIZE * sizeof(SJCacheTag)) + sizeof(*SJNWaiting); #endif /* HAS_TEST_AND_SET */ cachesave = cacheblk = (char *) ShmemInitStruct("Jukebox cache metadata", metasize, &metafound); if (cacheblk == (char *) NULL) { SpinRelease(SJCacheLock); return (SM_FAIL); } /* * Order of items in shared memory is metadata header, number of * processes sleeping on the wait semaphore (if no test-and-set locks), * nblock cache, and jukebox cache entries. */ SJHeader = (SJCacheHeader *) cacheblk; cacheblk += sizeof(SJCacheHeader); #ifndef HAS_TEST_AND_SET SJNWaiting = (long *) cacheblk; cacheblk += sizeof(long); #endif /* ndef HAS_TEST_AND_SET */ SJNBlockCache = (SJCacheTag *) cacheblk; cacheblk += SJNBLKSIZE * sizeof(SJCacheTag); SJCache = (SJCacheItem *) cacheblk; /* * Now initialize the pointer to the shared memory hash table. */ info.keysize = sizeof(SJCacheTag); info.datasize = sizeof(int); info.hash = tag_hash; SJCacheHT = ShmemInitHash("Jukebox cache hash table", SJCACHESIZE, SJCACHESIZE, &info, (HASH_ELEM|HASH_FUNCTION)); if (SJCacheHT == (HTAB *) NULL) { SpinRelease(SJCacheLock); return (SM_FAIL); } /* * Okay, all our shared memory pointers are set up. If we did not * find the cache metadata entries in shared memory, or if the cache * has not been initialized from disk, initialize it in this backend. 
*/ if (!metafound || !(SJHeader->sjh_flags & (SJH_INITING|SJH_INITED))) { initcache = true; bzero((char *) cachesave, metasize); SJHeader->sjh_flags = SJH_INITING; #ifdef HAS_TEST_AND_SET S_LOCK(&(SJHeader->sjh_initlock)); #else /* HAS_TEST_AND_SET */ IpcSemaphoreLock(SJWaitSemId, 0, 1); *SJNWaiting = 1; #endif /* HAS_TEST_AND_SET */ } else { initcache = false; } /* don't need exclusive access anymore */ SpinRelease(SJCacheLock); pghome = GetPGHome(); sprintf(path, "%s/data/%s", pghome, SJCACHENAME); SJCacheVfd = PathNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600); if (SJCacheVfd < 0) { SJCacheVfd = PathNameOpenFile(path, O_RDWR, 0600); if (SJCacheVfd < 0) { /* if we were initializing the metadata, note our surrender */ if (!metafound) { SJHeader->sjh_flags &= ~SJH_INITING; _sjunwait_init(); } return (SM_FAIL); } } sprintf(path, "%s/data/%s", pghome, SJMETANAME); SJMetaVfd = PathNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600); if (SJMetaVfd < 0) { SJMetaVfd = PathNameOpenFile(path, O_RDWR, 0600); if (SJMetaVfd < 0) { /* if we were initializing the metadata, note our surrender */ if (!metafound) { SJHeader->sjh_flags &= ~SJH_INITING; _sjunwait_init(); } return (SM_FAIL); } } sprintf(path, "%s/data/%s", pghome, SJBLOCKNAME); SJBlockVfd = PathNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600); if (SJBlockVfd < 0) { SJBlockVfd = PathNameOpenFile(path, O_RDWR, 0600); if (SJBlockVfd < 0) { /* if we were initializing the metadata, note our surrender */ if (!metafound) { SJHeader->sjh_flags &= ~SJH_INITING; _sjunwait_init(); } return (SM_FAIL); } } /* * If it's our responsibility to initialize the shared-memory cache * metadata, then go do that. Sjcacheinit() will elog(FATAL, ...) if * it can't initialize the cache, so we don't need to worry about a * return value here. */ if (initcache) { _sjcacheinit(); } /* * Finally, we need to initialize the data structures we use for * communicating with the jukebox. 
*/ if (pgjb_init() == SM_FAIL) return (SM_FAIL); return (SM_SUCCESS); } static void _sjcacheinit() { int nbytes, nread; int nentries; int nblocks; int i; SJCacheItem *cur; SJHashEntry *result; bool found; /* sanity check */ if ((SJHeader->sjh_flags & SJH_INITED) || !(SJHeader->sjh_flags & SJH_INITING)) { elog(FATAL, "sj cache header metadata corrupted."); } /* suck in the metadata */ nbytes = SJCACHESIZE * sizeof(SJCacheItem); nread = FileRead(SJMetaVfd, (char *) SJCache, nbytes); /* be sure we got an integral number of entries */ nentries = nread / sizeof(SJCacheItem); if ((nentries * sizeof(SJCacheItem)) != nread) { SJHeader->sjh_flags &= ~SJH_INITING; _sjunwait_init(); elog(FATAL, "sj cache metadata file corrupted."); } /* * Clear out the nblock cache */ bzero((char *) SJNBlockCache, SJNBLKSIZE * sizeof(SJCacheTag)); /* add every group that appears in the cache to the hash table */ for (i = 0; i < nentries; i++) { cur = &(SJCache[i]); result = _sjhashop(&(cur->sjc_tag), HASH_ENTER, &found); /* store the group number for this key in the hash table */ result->sjhe_groupno = i; /* no io in progress */ cur->sjc_gflags &= ~SJC_IOINPROG; cur->sjc_refcount = 0; #ifdef HAS_TEST_AND_SET S_UNLOCK(&(cur->sjc_iolock)); #endif HAS_TEST_AND_SET } /* * Now construct the LRU list (free list). Extents will be nominated * for reuse in this order. Since we have no usage information, we * adopt the following policy: any extents not yet allocated in the * cache are come first in the list, in order. These are followed by * the allocated extents, in order. The free list head is the first * unallocated extent, and its tail is the last allocated one. This * list is doubly-linked and is not circular. 
*/ if (nentries == SJCACHESIZE || nentries == 0) { cur = &(SJCache[i]); cur->sjc_freeprev = i - 1; if (i == SJCACHESIZE - 1) { cur->sjc_freenext = -1; } else { cur->sjc_freenext = i + 1; } /* list head, tail pointers */ SJHeader->sjh_freehead = 0; SJHeader->sjh_freetail = SJCACHESIZE - 1; } else { for (i = 0; i < nentries; i++) { cur = &(SJCache[i]); if (i == 0) cur->sjc_freeprev = SJCACHESIZE - 1; else cur->sjc_freeprev = i - 1; if (i == nentries - 1) cur->sjc_freenext = -1; else cur->sjc_freenext = i + 1; } for (i = nentries; i < SJCACHESIZE; i++) { cur = &(SJCache[i]); /* mark this as unused by setting oid to invalid object id */ cur->sjc_oid = InvalidObjectId; if (i == nentries) cur->sjc_freeprev = -1; else cur->sjc_freeprev = i - 1; if (i == SJCACHESIZE - 1) cur->sjc_freenext = 0; else cur->sjc_freenext = i + 1; } /* list head, tail pointers */ SJHeader->sjh_freehead = nentries; SJHeader->sjh_freetail = nentries - 1; } /* set up cache metadata header struct */ SJHeader->sjh_nentries = 0; SJHeader->sjh_flags = SJH_INITED; } /* * _sjunwait_init() -- Release initialization lock on the jukebox cache. * * When we initialize the cache, we don't keep the cache semaphore * locked. Instead, we set a flag in the metadata to let other * backends know that we're doing the initialization. This lets * others start running queries immediately, even if the cache is * not yet populated. If they want to look something up in the * cache, they'll block on the flag we set, and wait for us to finish. * If they don't need the jukebox, they can run unimpeded. When we * finish, we call _sjunwait_init() to release the initialization lock * that we hold during initialization. * * When we do this, either the cache is properly initialized, or * we detected some error we couldn't deal with. In either case, * we no longer need exclusive access to the cache metadata. 
*/ static void _sjunwait_init() { #ifdef HAS_TEST_AND_SET S_UNLOCK(&(SJHeader->sjh_initlock)); #else /* HAS_TEST_AND_SET */ /* atomically V the semaphore once for every waiting process */ SpinAcquire(SJCacheLock); IpcSemaphoreUnlock(SJWaitSemId, 0, *SJNWaiting); *SJNWaiting = 0; SpinRelease(SJCacheLock); #endif /* HAS_TEST_AND_SET */ } /* * _sjunwait_io() -- Release IO lock on the jukebox cache. * * While we're doing IO on a particular group in the cache, any other * process that wants to touch that group needs to wait until we're * finished. If we have TASlocks, then a wait lock appears on the * group entry in the cache metadata. Otherwise, we use the wait * semaphore in the same way as for initialization, above. */ static void _sjunwait_io(item) SJCacheItem *item; { item->sjc_gflags &= ~SJC_IOINPROG; #ifdef HAS_TEST_AND_SET S_UNLOCK(&(item->sjc_iolock)); #else /* HAS_TEST_AND_SET */ /* atomically V the wait semaphore once for each sleeping process */ SpinAcquire(SJCacheLock); if (*SJNWaiting > 0) { IpcSemaphoreUnlock(SJWaitSemId, 0, *SJNWaiting); *SJNWaiting = 0; } SpinRelease(SJCacheLock); #endif /* HAS_TEST_AND_SET */ } /* * _sjwait_init() -- Wait for cache initialization to complete. * * This routine is called when we want to access jukebox cache metadata, * but someone else is initializing it. When we return, the init lock * has been released and we can retry our access. On entry, we must be * holding the cache metadata lock. */ static void _sjwait_init() { #ifdef HAS_TEST_AND_SET SpinRelease(SJCacheLock); S_LOCK(&(SJHeader->sjh_initlock)); S_UNLOCK(&(SJHeader->sjh_initlock)); #else /* HAS_TEST_AND_SET */ (*SJNWaiting)++; SpinRelease(SJCacheLock); IpcSemaphoreLock(SJWaitSemId, 0, 1); #endif /* HAS_TEST_AND_SET */ } /* * _sjwait_io() -- Wait for group IO to complete. * * This routine is called when we discover that some other process is * doing IO on a group in the cache that we want to use. 
We need to * wait for that IO to complete before we can use the group. On entry, * we must hold the cache metadata lock. On return, we don't hold that * lock, and the IO completed. We can retry our access. */ static void _sjwait_io(item) SJCacheItem *item; { #ifdef HAS_TEST_AND_SET SpinRelease(SJCacheLock); S_LOCK(&(item->sjc_iolock)); S_UNLOCK(&(item->sjc_iolock)); #else /* HAS_TEST_AND_SET */ (*SJNWaiting)++; SpinRelease(SJCacheLock); IpcSemaphoreLock(SJWaitSemId, 0, 1); #endif /* HAS_TEST_AND_SET */ } /* * sjshutdown() -- shut down the jukebox storage manager. * * We want to close the cache and metadata files, release all our open * jukebox connections, and let the caller know we're done. */ int sjshutdown() { FileClose(SJCacheVfd); FileClose(SJMetaVfd); return (SM_SUCCESS); } /* * sjcreate() -- Create the requested relation on the jukebox. * * Creating a new relation requires us to make a new cache group, * fill in the descriptor page, make sure everything is on disk, * and create the new relation file to store the last page of data * on magnetic disk. */ int sjcreate(reln) Relation reln; { SJCacheItem *item; SJGroupDesc *group; SJCacheTag tag; ObjectId dbid; ObjectId relid; File vfd; int grpno; int i; char path[SJPATHLEN]; /* * If the cache is in the process of being initialized, then we need * to wait for initialization to complete. If the cache is not yet * initialized, and no one else is doing it, then we need to initialize * it ourselves. Sjwait_init() or sj_init() will release the cache * lock for us. */ SpinAcquire(SJCacheLock); if (!(SJHeader->sjh_flags & SJH_INITED)) { if (SJHeader->sjh_flags & SJH_INITING) { _sjwait_init(); } else { sjinit(); } return (sjcreate(reln)); } SpinRelease(SJCacheLock); /* * By here, cache is initialized. We are aggressively lazy, and * will not allocate an initial extent for this relation until it's * actually used. We just register an initial block count of zero. 
*/ if (reln->rd_rel->relisshared) tag.sjct_dbid = (ObjectId) 0; else tag.sjct_dbid = MyDatabaseId; tag.sjct_relid = reln->rd_id; tag.sjct_base = (BlockNumber) 0; _sjregnblocks(&tag); /* last thing to do is to create the mag-disk file to hold last page */ if (reln->rd_rel->relisshared) strcpy(path, "../"); else path[0] = '\0'; strncpy(path, &(reln->rd_rel->relname.data[0]), sizeof(NameData)); vfd = FileNameOpenFile(path, O_CREAT|O_RDWR|O_EXCL, 0600); return (vfd); } /* * _sjregister() -- Make catalog entry for a new extent * * When we create a new jukebox relation, or when we add a new extent * to an existing relation, we need to make the appropriate entry in * pg_plmap(). This routine does that. * * On entry, we have item pinned; on exit, it's still pinned, and the * system catalogs have been updated to reflect the presence of the * new extent. */ static void _sjregister(item, group) SJCacheItem *item; SJGroupDesc *group; { Relation plmap; ObjectId plid; Form_pg_plmap plmdata; HeapTuple plmtup; /* * Choose a platter to put the new extent on. This returns a filled-in * pg_plmap tuple data part to insert into the system catalogs. The * choose routine also figures out where to place the extent on the * platter. * * Sjchoose() palloc's and fills in plmdata; we free it later in this * routine. 
*/ plmdata = _sjchoose(item); /* record plid, offset, extent size for caller */ group->sjgd_plid = plmdata->plid; group->sjgd_jboffset = plmdata->ploffset; group->sjgd_extentsz = plmdata->plextentsz; plmtup = (HeapTuple) heap_addheader(Natts_pg_plmap, sizeof(FormData_pg_plmap), (char *) plmdata); /* clean up the memory that heap_addheader() palloc()'ed for us */ plmtup->t_oid = InvalidObjectId; bzero((char *) &(plmtup->t_chain), sizeof(plmtup->t_chain)); /* open the relation and lock it */ plmap = heap_openr(Name_pg_plmap); RelationSetLockForWrite(plmap); /* insert the new catalog tuple */ heap_insert(plmap, plmtup, (double *) NULL); /* done */ heap_close(plmap); /* be tidy */ pfree((char *) plmtup); pfree((char *) plmdata); } /* * _sjchoose() -- Choose a platter to receive a new extent. * * Allocation strategy is: * * + For the first extent of a new relation, put it on the first * with room for a new relation. The policy for allocating new * relations to a platter is implemented by pgjb_freespc(). * * + For second and subsequent extents of an existing relation: * * - If there's a platter holding another extent for this * relation, and that platter has room for this extent, * allocate it there. NOTE: this is true in the current * implementation, but it's a side effect of the way in which * we scan for free space on platters (we consider platters * in the same order every time we look). * * - Otherwise, allocate the extent on the first platter with * space for a new extent. 
*/ static Form_pg_plmap _sjchoose(item) SJCacheItem *item; { Relation plat; TupleDescriptor platdesc; HeapScanDesc platscan; HeapTuple plattup; Buffer buf; Form_pg_plmap plmdata; ObjectId plid; Datum d; Name platname; char *plname; bool isnull; bool done; int alloctype; /* allocate the tuple form */ plmdata = (Form_pg_plmap) palloc(sizeof(FormData_pg_plmap)); plname = (char *) palloc(sizeof(NameData) + 1); plat = heap_openr(Name_pg_platter); /* * We do short-term (non-two-phase) locking on the platter relation * in order to guarantee serial allocations. */ RelationSetLockForWrite(plat); platdesc = RelationGetTupleDescriptor(plat); platscan = heap_beginscan(plat, false, NowTimeQual, 0, NULL); /* figure out if this is a new or an old relation allocation */ alloctype = (item->sjc_tag.sjct_base > 0 ? SJOLDRELN : SJNEWRELN); /* find a qualifying tuple in pg_platter */ plattup = heap_getnext(platscan, false, &buf); if (!HeapTupleIsValid(plattup)) elog(WARN, "_sjchoose: no platters in pg_plmap"); done = false; do { /* get platter OID, name */ plid = plmdata->plid = plattup->t_oid; d = (Datum) heap_getattr(plattup, buf, Anum_pg_platter_plname, platdesc, &isnull); platname = DatumGetName(d); strncpy(plname, &(platname->data[0]), sizeof(NameData)); plname[sizeof(NameData)] = '\0'; done = pgjb_freespc(plname, plid, alloctype); /* done with this tuple */ ReleaseBuffer(buf); /* next tuple */ if (!done) { plattup = heap_getnext(platscan, false, &buf); if (!HeapTupleIsValid(plattup)) elog(WARN, "_sjchoose: no space on platters in pg_plmap"); } } while (!done); /* init the rest of the fields */ plmdata->pldbid = item->sjc_tag.sjct_dbid; plmdata->plrelid = item->sjc_tag.sjct_relid; plmdata->plblkno = item->sjc_tag.sjct_base; plmdata->plextentsz = SJEXTENTSZ; plmdata->ploffset = pgjb_offset(plname, plmdata->plid, plmdata->plextentsz); /* no longer need an exclusive lock for the allocation */ RelationUnsetLockForWrite(plat); heap_endscan(platscan); heap_close(plat); /* save platter 
name, id, offset in item */ bcopy(plname, &(item->sjc_plname.data[0]), sizeof(NameData)); item->sjc_plid = plmdata->plid; item->sjc_jboffset = plmdata->ploffset; return (plmdata); } /* * _sjallocgrp() -- Allocate a new group in the cache for use by some * relation. * * If there are any unused slots in the cache, we just return one * of those. Otherwise, we need to kick out the least-recently-used * group and make room for another. * * On entry, we hold the cache metadata lock. On exit, we still hold * it. In between, we may release it in order to do I/O on the cache * group we're kicking out, if we have to do that. */ static SJCacheItem * _sjallocgrp(grpno) int *grpno; { SJCacheItem *item; /* free list had better not be empty */ if (SJHeader->sjh_nentries == SJCACHESIZE) elog(FATAL, "_sjallocgrp: no groups on free list!"); /* * Get a new group off the free list. As a side effect, _sjgetgrp() * bumps the ref count on the group for us. */ *grpno = _sjgetgrp(); item = &SJCache[*grpno]; return (item); } /* * _sjgetgrp() -- Get a group off the free list. * * This routine returns the least-recently used group on the free list * to the caller. If necessary, the (old) contents of the group are * forced to the platter. On entry, we hold the cache metadata lock. * We release it and mark IOINPROG on the group if we need to do any * io. We reacquire the lock before returning. * * We know that there's something on the free list when we call this * routine. * * There's an interesting problem with write-once media that we have * to deal with here. It is possible in postgres for a half-full * buffer to be flushed to stable storage, then to be reloaded into * the buffer cache, filled completely, and for a new page to be * allocated before the old page is flushed again. If this happens * to us, it's possible for the half-full page to get flushed all the * way through to an optical disk platter, where it can never be * overwritten. 
* * In order to deal with this, we probe the buffer manager for all * dirty blocks it has that live on an extent before we flush the * extent to permanent storage. */ static int _sjgetgrp() { SJCacheItem *item; int grpno; int where; long loc; bool found; int grpoffset; BlockNumber nblocks; Relation reln; bool dirty; int i; int offset; ObjectId dbid; ObjectId relid; BlockNumber base; /* pull the least-recently-used group off the free list */ grpno = SJHeader->sjh_freehead; item = &(SJCache[grpno]); _sjtouch(item); /* if it was previously a valid group, remove it from the hash table */ if (item->sjc_oid != InvalidObjectId) _sjhashop(&(item->sjc_tag), HASH_REMOVE, &found); /* * See if we need to flush the group to the jukebox. If we're working * with an entirely new item (the corresponding cache slot is empty), * dbid == relid == base == 0, so we can ignore the flags. Otherwise, * we check every flags entry in the group descriptor to see if anyone * wants to get flushed. */ dirty = false; if (item->sjc_tag.sjct_dbid != 0 || item->sjc_tag.sjct_relid != 0) { if (MUST_FLUSH(item->sjc_gflags)) { dirty = true; } else { for (i = 0; i < SJGRPSIZE; i++) { if (MUST_FLUSH(item->sjc_flags[i])) { dirty = true; break; } } } } if (!dirty) return (grpno); /* * By here, we need to force the group to stable storage outside the * cache. Mark IOINPROG on the group (in fact, this shouldn't matter, * since no one should be able to get at it -- we just got it off the * free list and removed its hash table entry), release our exclusive * lock, and write it out. */ SET_IO_LOCK(item); if (_sjreadgrp(item, grpno) == SM_FAIL) { _sjunwait_io(item); elog(FATAL, "_sjgetgrp: cannot read group %d", grpno); } /* * Probe the buffer manager for dirty blocks that belong in this * extent. The buffer manager will copy them into the space we * pass in, and will mark them clean in the buffer cache. 
*/ dbid = item->sjc_tag.sjct_dbid; relid = item->sjc_tag.sjct_relid; base = item->sjc_tag.sjct_base; for (i = 0; i < SJGRPSIZE; i++) { if (MUST_FLUSH(item->sjc_flags[i])) { offset = (i * BLCKSZ) + JBBLOCKSZ; DirtyBufferCopy(dbid, relid, base + i, &(SJCacheBuf[offset])); } } nblocks = _sjfindnblocks(&(item->sjc_tag)); if (pgjb_wrtextent(item, nblocks, &(SJCacheBuf[0])) == SM_FAIL) { _sjunwait_io(item); elog(FATAL, "_sjfree: cannot free group."); } _sjunwait_io(item); /* give us back our exclusive lock */ SpinAcquire(SJCacheLock); return (grpno); } static SJCacheItem * _sjfetchgrp(dbid, relid, blkno, grpno) ObjectId dbid; ObjectId relid; int blkno; int *grpno; { SJCacheItem *item; SJHashEntry *entry; bool found; SJCacheTag tag; SpinAcquire(SJCacheLock); tag.sjct_dbid = dbid; tag.sjct_relid = relid; tag.sjct_base = blkno; entry = _sjhashop(&tag, HASH_FIND, &found); if (found) { *grpno = entry->sjhe_groupno; item = &(SJCache[*grpno]); if (item->sjc_gflags & SJC_IOINPROG) { _sjwait_io(item); return (_sjfetchgrp(dbid, relid, blkno, grpno)); } _sjtouch(item); SpinRelease(SJCacheLock); } else { item = _sjallocgrp(grpno); /* * Possible race condition: someone else instantiated the extent * we want while we were off allocating a group for it. If that * happened, we want to put our just-allocated group back on the * free list for someone else to use. */ entry = _sjhashop(&tag, HASH_FIND, &found); if (found) { /* * Put the just-allocated group back on the free list. This * requires us to reenter it into the hash table if it refers * to actual data. We only want to do this if we got a different * free group from the other process. 
*/ if (entry->sjhe_groupno != *grpno) { if (item->sjc_oid != InvalidObjectId) (void) _sjhashop(&(item->sjc_tag), HASH_ENTER, &found); _sjunpin(item); } item = &(SJCache[entry->sjhe_groupno]); /* if io in progress, wait for it to complete and try again */ if (item->sjc_gflags & SJC_IOINPROG) { _sjunpin(item); _sjwait_io(item); return (_sjfetchgrp(dbid, relid, blkno)); } SpinRelease(SJCacheLock); } else { /* okay, we need to read the extent from a platter */ bcopy((char *) &tag, (char *) &(item->sjc_tag), sizeof(tag)); entry = _sjhashop(&tag, HASH_ENTER, &found); entry->sjhe_groupno = *grpno; SET_IO_LOCK(item); /* read the extent off the optical platter */ _sjrdextent(item); /* update the magnetic disk cache */ _sjwritegrp(item, *grpno); /* done, release IO lock */ _sjunwait_io(item); } } return (item); } /* * _sjrdextent() -- Read an extent from an optical platter. * * This routine prepares the SJCacheItem group for the pgjb_rdextent() * routine to work with, and passes it along. We don't have exclusive * access to the cache metadata on entry, but we do have the IOINPROGRESS * bit set on the item we're working with, so on one else will screw * around with it. 
*/ static void _sjrdextent(item) SJCacheItem *item; { Relation reln; HeapScanDesc hscan; HeapTuple htup; TupleDescriptor tupdesc; Datum d; Boolean n; Name plname; ScanKeyEntryData skey[3]; /* first get platter id and offset from pg_plmap */ reln = heap_openr(Name_pg_plmap); tupdesc = RelationGetTupleDescriptor(reln); ScanKeyEntryInitialize(&skey[0], 0x0, Anum_pg_plmap_pldbid, ObjectIdEqualRegProcedure, ObjectIdGetDatum(item->sjc_tag.sjct_dbid)); ScanKeyEntryInitialize(&skey[1], 0x0, Anum_pg_plmap_plrelid, ObjectIdEqualRegProcedure, ObjectIdGetDatum(item->sjc_tag.sjct_relid)); ScanKeyEntryInitialize(&skey[2], 0x0, Anum_pg_plmap_plblkno, Integer32EqualRegProcedure, Int32GetDatum(item->sjc_tag.sjct_base)); hscan = heap_beginscan(reln, false, NowTimeQual, 3, &skey[0]); /* * if there is no matching entry in the platter map, then we're * asking for an extent that has not yet been allocated. in this * case, we return a zero-filled extent. this happens, for example, * when we try to read the initial block of a relation before one * has been written. 
*/ if (!HeapTupleIsValid(htup = heap_getnext(hscan, false, (Buffer *) NULL))) { heap_endscan(hscan); heap_close(reln); bzero(&(SJCacheBuf[0]), SJBUFSIZE); return; } d = (Datum) heap_getattr(htup, InvalidBuffer, Anum_pg_plmap_plid, tupdesc, &n); item->sjc_plid = DatumGetObjectId(d); d = (Datum) heap_getattr(htup, InvalidBuffer, Anum_pg_plmap_ploffset, tupdesc, &n); item->sjc_jboffset = DatumGetInt32(d); heap_endscan(hscan); heap_close(reln); /* now figure out the platter's name from pg_platter */ reln = heap_openr(Name_pg_platter); tupdesc = RelationGetTupleDescriptor(reln); ScanKeyEntryInitialize(&skey[0], 0x0, ObjectIdAttributeNumber, ObjectIdEqualRegProcedure, ObjectIdGetDatum(item->sjc_plid)); hscan = heap_beginscan(reln, false, NowTimeQual, 1, &skey[0]); if (!HeapTupleIsValid(htup = heap_getnext(hscan, false, (Buffer *) NULL))) { _sjunwait_io(item); elog(WARN, "_sjrdextent: cannot find platter oid %d", item->sjc_plid); } d = (Datum) heap_getattr(htup, InvalidBuffer, Anum_pg_platter_plname, tupdesc, &n); plname = DatumGetName(d); bcopy(&(plname->data[0]), &(item->sjc_plname.data[0]), sizeof(NameData)); heap_endscan(hscan); heap_close(reln); /* * Okay, by here, we have all the fields in item filled in except for * sjc_oid, sjc_gflags, and sjc_flags[]. Those are all filled in by * pgjb_rdextent(), so we call that routine to do the work. */ if (pgjb_rdextent(item, &SJCacheBuf[0]) == SM_FAIL) { _sjunwait_io(item); elog(WARN, "read of extent <%d,%d,%d> from platter %d failed", item->sjc_tag.sjct_dbid, item->sjc_tag.sjct_relid, item->sjc_tag.sjct_base, item->sjc_plid); } } /* * _sjtouch() -- Increment reference count on the supplied item. * * If this is the first reference to the item, we remove it from the * free list. On entry and exit, we hold SJCacheLock. If we pulled * the item off the free list, we adjust SJHeader->sjh_nentries. */ static void _sjtouch(item) SJCacheItem *item; { /* * Bump the reference count to this group. 
If it's the first * reference, pull the group off the free list. */ if (++(item->sjc_refcount) == 1) { /* if at the start of the free list, adjust 'head' pointer */ if (item->sjc_freeprev != -1) SJCache[item->sjc_freeprev].sjc_freenext = item->sjc_freenext; else SJHeader->sjh_freehead = item->sjc_freenext; /* if at the end of the free list, adjust 'tail' pointer */ if (item->sjc_freenext != -1) SJCache[item->sjc_freenext].sjc_freeprev = item->sjc_freeprev; else SJHeader->sjh_freetail = item->sjc_freeprev; /* disconnect from free list */ item->sjc_freeprev = item->sjc_freenext = -1; /* keep track of number of groups allocated */ (SJHeader->sjh_nentries)++; } } /* * _sjunpin() -- Decrement reference count on the supplied item. * * If we are releasing the last reference to the supplied item, we put * it back on the free list. On entry and exit, we do not hold the * cache lock. We must acquire it in order to carry out the requested * release. */ static void _sjunpin(item) SJCacheItem *item; { int grpno; /* exclusive access */ SpinAcquire(SJCacheLock); /* item had better be pinned */ if (item->sjc_refcount <= 0) elog(FATAL, "_sjunpin: illegal reference count"); /* * Unpin the item. If this is the last reference, put the item at the * end of the free list. Implemenation note: if SJHeader->sjh_freehead * is -1, then the list is empty, and SJHeader->sjh_freetail is also -1. 
*/ if (--(item->sjc_refcount) == 0) { grpno = GROUPNO(item); if (SJHeader->sjh_freehead == -1) { SJHeader->sjh_freehead = grpno; } else { item->sjc_freeprev = SJHeader->sjh_freetail; SJCache[SJHeader->sjh_freetail].sjc_freenext = grpno; } /* put item at end of free list */ SJHeader->sjh_freetail = grpno; (SJHeader->sjh_nentries)--; } SpinRelease(SJCacheLock); } static int _sjwritegrp(item, grpno) SJCacheItem *item; int grpno; { long seekpos; long loc; int nbytes, i; char *buf; /* first update the metadata file */ seekpos = grpno * sizeof(*item); if ((loc = FileSeek(SJMetaVfd, seekpos, L_SET)) != seekpos) return (SM_FAIL); nbytes = sizeof(*item); buf = (char *) item; while (nbytes > 0) { i = FileWrite(SJMetaVfd, buf, nbytes); if (i < 0) return (SM_FAIL); nbytes -= i; buf += i; } FileSync(SJMetaVfd); /* now update the cache file */ seekpos = grpno * SJBUFSIZE; if ((loc = FileSeek(SJCacheVfd, seekpos, L_SET)) != seekpos) return (SM_FAIL); nbytes = SJBUFSIZE; buf = &(SJCacheBuf[0]); while (nbytes > 0) { i = FileWrite(SJCacheVfd, buf, nbytes); if (i < 0) return (SM_FAIL); nbytes -= i; buf += i; } FileSync(SJCacheVfd); return (SM_SUCCESS); } /* * sjextend() -- extend a relation by one block. */ int sjextend(reln, buffer) Relation reln; char *buffer; { SJCacheItem *item; SJHashEntry *entry; SJCacheTag tag; int grpno; int nblocks; int base; int offset; bool found; int grpoffset; long seekpos; RelationSetLockForExtend(reln); nblocks = sjnblocks(reln); base = (nblocks / SJGRPSIZE) * SJGRPSIZE; SpinAcquire(SJCacheLock); /* * If the highest extent is full, we need to allocate a new group in * the cache. As a side effect, _sjnewextent will release SJCacheLock. * We need to reacquire it immediately afterwards. 
*/ if ((nblocks % SJGRPSIZE) == 0) { _sjnewextent(reln, base); SpinAcquire(SJCacheLock); } if (reln->rd_rel->relisshared) tag.sjct_dbid = (ObjectId) 0; else tag.sjct_dbid = MyDatabaseId; tag.sjct_relid = reln->rd_id; tag.sjct_base = base; entry = _sjhashop(&tag, HASH_FIND, &found); if (!found) { SpinRelease(SJCacheLock); elog(WARN, "sjextend: hey mao: your group is missing."); } /* find the item and block in the item to write */ grpno = entry->sjhe_groupno; item = &SJCache[grpno]; grpoffset = nblocks % SJGRPSIZE; /* * Okay, allocate the next block in this extent by marking it 'not * missing'. Once we've done this, we must hold the extend lock * until end of transaction, since the number of allocated blocks no * longer matches the block count visible to other backends. */ if (!(item->sjc_flags[grpoffset] & SJC_MISSING)) { SpinRelease(SJCacheLock); elog(WARN, "sjextend: cache botch: next block in group present"); } else { item->sjc_flags[grpoffset] &= ~SJC_MISSING; } _sjtouch(item); SET_IO_LOCK(item); /* page is allocated */ item->sjc_flags[grpoffset] = SJC_CLEAR; /* verify group descriptor data in the cache file */ if (_sjgroupvrfy(item, grpno) == SM_FAIL) { _sjunpin(item); _sjunwait_io(item); return (SM_FAIL); } /* write the page */ seekpos = (grpno * SJBUFSIZE) + ((nblocks % SJGRPSIZE) * BLCKSZ) + JBBLOCKSZ; if (FileSeek(SJCacheVfd, seekpos, L_SET) != seekpos) { elog(NOTICE, "sjextend: failed to seek to buffer lock (%d)", seekpos); _sjunpin(item); _sjunwait_io(item); return (SM_FAIL); } if (FileWrite(SJCacheVfd, buffer, BLCKSZ) != BLCKSZ) { elog(NOTICE, "sjextend: can't write page %d", nblocks); _sjunwait_io(item); _sjunpin(item); return (SM_FAIL); } /* write the updated cache metadata entry */ seekpos = grpno * sizeof(*item); if (FileSeek(SJMetaVfd, seekpos, L_SET) != seekpos) { elog(NOTICE, "sjextend: seek to %d on metadata file failed", seekpos); _sjunwait_io(item); _sjunpin(item); return (SM_FAIL); } if (FileWrite(SJMetaVfd, (char *) item, sizeof(*item)) < 0) 
{ elog(NOTICE, "sjextend: write of metadata file failed"); _sjunwait_io(item); _sjunpin(item); return (SM_FAIL); } /* success */ _sjunwait_io(item); _sjunpin(item); tag.sjct_base = ++nblocks; _sjregnblocks(&tag); return (SM_SUCCESS); } static int _sjreadgrp(item, grpno) SJCacheItem *item; int grpno; { long seekpos; long loc; int nbytes, i; char *buf; SJGroupDesc *gdesc; /* get the group from the cache file */ seekpos = grpno * SJBUFSIZE; if ((loc = FileSeek(SJCacheVfd, seekpos, L_SET)) != seekpos) { elog(NOTICE, "_sjreadgrp: cannot seek"); return (SM_FAIL); } nbytes = SJBUFSIZE; buf = &(SJCacheBuf[0]); while (nbytes > 0) { i = FileRead(SJCacheVfd, buf, nbytes); if (i < 0) { elog(NOTICE, "_sjreadgrp: read failed"); return (SM_FAIL); } nbytes -= i; buf += i; } gdesc = (SJGroupDesc *) &(SJCacheBuf[0]); if (gdesc->sjgd_magic != SJGDMAGIC || gdesc->sjgd_version != SJGDVERSION || gdesc->sjgd_groupoid != item->sjc_oid) { elog(NOTICE, "_sjreadgrp: trashed cache"); return (SM_FAIL); } return (SM_SUCCESS); } int sjunlink(reln) Relation reln; { return (SM_FAIL); } /* * _sjnewextent() -- Add a new extent to a relation in the jukebox cache. 
*/ static void _sjnewextent(reln, base) Relation reln; BlockNumber base; { SJHashEntry *entry; SJGroupDesc *group; SJCacheItem *item; bool found; int grpno; int i; item = _sjallocgrp(&grpno); if (reln->rd_rel->relisshared) item->sjc_tag.sjct_dbid = (ObjectId) 0; else item->sjc_tag.sjct_dbid = MyDatabaseId; item->sjc_tag.sjct_relid = (ObjectId) reln->rd_id; item->sjc_tag.sjct_base = base; entry = _sjhashop(&(item->sjc_tag), HASH_ENTER, &found); entry->sjhe_groupno = grpno; SET_IO_LOCK(item); /* set flags on item, initialize group descriptor block */ item->sjc_gflags = SJC_CLEAR; for (i = 0; i < SJGRPSIZE; i++) item->sjc_flags[i] = SJC_MISSING; /* should be smarter and only bzero what we need to */ bzero(SJCacheBuf, SJBUFSIZE); group = (SJGroupDesc *) (&SJCacheBuf[0]); group->sjgd_magic = SJGDMAGIC; group->sjgd_version = SJGDVERSION; if (reln->rd_rel->relisshared) { group->sjgd_dbid = (ObjectId) 0; } else { strncpy(&(group->sjgd_dbname.data[0]), &(MyDatabaseName->data[0]), sizeof(NameData)); group->sjgd_dbid = (ObjectId) MyDatabaseId; } strncpy(&(group->sjgd_relname.data[0]), &(reln->rd_rel->relname.data[0]), sizeof(NameData)); group->sjgd_relid = reln->rd_id; group->sjgd_relblkno = base; item->sjc_oid = group->sjgd_groupoid = newoid(); /* * Record the presence of the new extent in the system catalogs. The * plid, jboffset, and extentsz fields are filled in by _sjregister() * or the routines that it calls. Note that we do not force the new * group descriptor block all the way to the optical platter here. * We do decide where to place it, however, and must go to a fair amount * of trouble elsewhere in the code to avoid allocating the same extent * to a different relation, or block within the same relation. */ _sjregister(item, group); /* * Write the new group cache entry to disk. Sjwritegrp() knows where * the cache buffer begins, and forces out the group descriptor we * just set up. 
*/ if (_sjwritegrp(item, grpno) == SM_FAIL) { _sjunwait_io(item); elog(FATAL, "_sjnewextent: cannot write new extent to disk"); } _sjregnblocks(&(item->sjc_tag)); /* can now release i/o lock on the item we just added */ _sjunwait_io(item); /* no longer need the reference */ _sjunpin(item); } /* * _sjhashop() -- Do lookup, insertion, or deletion on the metadata hash * table in shared memory. * * We don't worry about the number of entries in the hash table here; * that's handled at a higher level (_sjallocgrp and _sjgetgrp). We * hold SJCacheLock on entry. */ static SJHashEntry * _sjhashop(tagP, op, foundP) SJCacheTag *tagP; HASHACTION op; bool *foundP; { SJHashEntry *entry; entry = (SJHashEntry *) hash_search(SJCacheHT, (char *) tagP, op, foundP); if (entry == (SJHashEntry *) NULL) { SpinRelease(SJCacheLock); elog(FATAL, "_sjhashop: hash table corrupt."); } if (*foundP) { if (op == HASH_ENTER) { SpinRelease(SJCacheLock); elog(WARN, "_sjhashop: cannot enter <%d,%d,%d>: already exists", tagP->sjct_dbid, tagP->sjct_relid, tagP->sjct_base); } } else { if (op == HASH_REMOVE) { SpinRelease(SJCacheLock); elog(WARN, "_sjhashop: cannot delete <%d,%d,%d>: missing", tagP->sjct_dbid, tagP->sjct_relid, tagP->sjct_base); } } return (entry); } int sjopen(reln) Relation reln; { char *path; int fd; extern char *relpath(); path = relpath(&(reln->rd_rel->relname.data[0])); fd = FileNameOpenFile(path, O_RDWR, 0600); return (fd); } int sjclose(reln) Relation reln; { FileClose(reln->rd_fd); return (SM_SUCCESS); } int sjread(reln, blocknum, buffer) Relation reln; BlockNumber blocknum; char *buffer; { SJCacheItem *item; ObjectId reldbid; BlockNumber base; int offset; int grpno; long seekpos; /* fake successful read on non-existent data */ if (sjnblocks(reln) <= blocknum) { bzero(buffer, BLCKSZ); return (SM_SUCCESS); } if (reln->rd_rel->relisshared) reldbid = (ObjectId) 0; else reldbid = MyDatabaseId; base = (blocknum / SJGRPSIZE) * SJGRPSIZE; item = _sjfetchgrp(reldbid, reln->rd_id, base, 
&grpno); /* shd expand _sjfetchgrp() inline to avoid extra semop()s */ SpinAcquire(SJCacheLock); SET_IO_LOCK(item); /* First read and verify the group descriptor metadata */ if (_sjgroupvrfy(item, grpno) == SM_FAIL) { _sjunpin(item); _sjunwait_io(item); return (SM_FAIL); } /* By here, group descriptor metadata is okay */ seekpos = (grpno * SJBUFSIZE) + ((blocknum % SJGRPSIZE) * BLCKSZ) + JBBLOCKSZ; if (FileSeek(SJCacheVfd, seekpos, L_SET) != seekpos) { elog(NOTICE, "sjread: failed to seek to buffer lock (%d)", seekpos); _sjunpin(item); _sjunwait_io(item); return (SM_FAIL); } /* read the requested page */ if (FileRead(SJCacheVfd, buffer, BLCKSZ) != BLCKSZ) { elog(NOTICE, "sjread: can't read page %d", blocknum); _sjunwait_io(item); return (SM_FAIL); } _sjunwait_io(item); _sjunpin(item); return (SM_SUCCESS); } static int _sjgroupvrfy(item, grpno) SJCacheItem *item; int grpno; { long seekpos; SJGroupDesc gdesc; seekpos = SJBUFSIZE * grpno; if (FileSeek(SJCacheVfd, seekpos, L_SET) != seekpos) { elog(NOTICE, "sjgroupvrfy: Cannot seek to %d on sj cache file", seekpos); return (SM_FAIL); } if (FileRead(SJCacheVfd, (char *) &gdesc, sizeof(gdesc)) < 0) { elog(NOTICE, "sjgroupvrfy: Cannot read group desc from sj cache file"); return (SM_FAIL); } if (gdesc.sjgd_magic != SJGDMAGIC || gdesc.sjgd_version != SJGDVERSION || gdesc.sjgd_groupoid != item->sjc_oid) { elog(NOTICE, "sjgroupvrfy: trashed cache"); return (SM_FAIL); } return (SM_SUCCESS); } int sjwrite(reln, blocknum, buffer) Relation reln; BlockNumber blocknum; char *buffer; { SJCacheItem *item; ObjectId reldbid; BlockNumber base; int offset; int grpno; int which; long seekpos; if (reln->rd_rel->relisshared) reldbid = (ObjectId) 0; else reldbid = MyDatabaseId; base = (blocknum / SJGRPSIZE) * SJGRPSIZE; item = _sjfetchgrp(reldbid, reln->rd_id, base, &grpno); /* shd expand _sjfetchgrp() inline to avoid extra semop()s */ SpinAcquire(SJCacheLock); which = blocknum % SJGRPSIZE; if (item->sjc_flags[which] & SJC_ONPLATTER) { 
SpinRelease(SJCacheLock); _sjunpin(item); elog(WARN, "sjwrite: optical platters are write-once, cannot rewrite"); } SET_IO_LOCK(item); item->sjc_flags[which] &= ~SJC_MISSING; /* verify group descriptor data in the cache file */ if (_sjgroupvrfy(item, grpno) == SM_FAIL) { _sjunpin(item); _sjunwait_io(item); return (SM_FAIL); } /* write the page */ seekpos = (grpno * SJBUFSIZE) + ((blocknum % SJGRPSIZE) * BLCKSZ) + JBBLOCKSZ; if (FileSeek(SJCacheVfd, seekpos, L_SET) != seekpos) { elog(NOTICE, "sjwrite: failed to seek to buffer lock (%d)", seekpos); _sjunpin(item); _sjunwait_io(item); return (SM_FAIL); } if (FileWrite(SJCacheVfd, buffer, BLCKSZ) != BLCKSZ) { elog(NOTICE, "sjwrite: can't read page %d", blocknum); _sjunwait_io(item); _sjunpin(item); return (SM_FAIL); } /* write the updated cache metadata entry */ seekpos = grpno * sizeof(*item); if (FileSeek(SJMetaVfd, seekpos, L_SET) != seekpos) { elog(NOTICE, "sjwrite: seek to %d on metadata file failed", seekpos); _sjunwait_io(item); _sjunpin(item); return (SM_FAIL); } if (FileWrite(SJMetaVfd, (char *) item, sizeof(*item)) < 0) { elog(NOTICE, "sjwrite: write of metadata file failed"); _sjunwait_io(item); _sjunpin(item); return (SM_FAIL); } _sjunwait_io(item); _sjunpin(item); return (SM_SUCCESS); } int sjflush(reln, blocknum, buffer) Relation reln; BlockNumber blocknum; char *buffer; { return (sjwrite(reln, blocknum, buffer)); } int sjblindwrt(dbstr, relstr, dbid, relid, blkno, buffer) char *dbstr; char *relstr; OID dbid; OID relid; BlockNumber blkno; char *buffer; { return (SM_FAIL); } /* * sjnblocks() -- Return the number of blocks that appear in this relation. * * Rather than compute this by walking through pg_plmap and fetching * groups off of platters, we store the number of blocks currently * allocated to a relation in a special Unix file. 
*/ int sjnblocks(reln) Relation reln; { SJCacheTag tag; int nblocks; if (reln->rd_rel->relisshared) tag.sjct_dbid = (ObjectId) 0; else tag.sjct_dbid = MyDatabaseId; tag.sjct_relid = reln->rd_id; tag.sjct_base = (BlockNumber) _sjfindnblocks(&tag); return ((int) (tag.sjct_base)); } /* * _sjfindnblocks() -- Find block count for the (dbid,relid) pair. */ static int _sjfindnblocks(tag) SJCacheTag *tag; { int nbytes; int i; SJCacheTag *cachetag; SJCacheTag mytag; cachetag = SJNBlockCache; i = 0; while (i < SJNBLKSIZE && cachetag->sjct_relid != (ObjectId) 0) { if (cachetag->sjct_dbid == tag->sjct_dbid && cachetag->sjct_relid == tag->sjct_relid) { return (cachetag->sjct_base); } i++; cachetag++; } if (FileSeek(SJBlockVfd, 0L, L_SET) != 0) { elog(FATAL, "_sjfindnblocks: cannot seek to zero on block count file"); } while ((nbytes = FileRead(SJBlockVfd, (char *)&mytag, sizeof(mytag))) > 0) { if (mytag.sjct_dbid == tag->sjct_dbid && mytag.sjct_relid == tag->sjct_relid) { if (i == SJNBLKSIZE) { /* fast pseudo-random function */ i = mytag.sjct_relid % SJNBLKSIZE; cachetag = &(SJNBlockCache[i]); } /* save cache tag */ cachetag->sjct_dbid = mytag.sjct_dbid; cachetag->sjct_relid = mytag.sjct_relid; cachetag->sjct_base = mytag.sjct_base; return (mytag.sjct_base); } } elog(FATAL, "_sjfindnblocks: cannot get block count for <%d,%d>", tag->sjct_dbid, tag->sjct_relid); } /* * _sjregnblocks() -- Remember the count of blocks for this relid. 
*/ static void _sjregnblocks(tag) SJCacheTag *tag; { int loc; int i; SJCacheTag *cachetag; SJCacheTag mytag; cachetag = SJNBlockCache; i = 0; while (i < SJNBLKSIZE && cachetag->sjct_relid != (ObjectId) 0) { if (cachetag->sjct_dbid == tag->sjct_dbid && cachetag->sjct_relid == tag->sjct_relid) break; i++; cachetag++; } if (i == SJNBLKSIZE) { i = tag->sjct_relid % SJNBLKSIZE; cachetag = &(SJNBlockCache[i]); } cachetag->sjct_dbid = tag->sjct_dbid; cachetag->sjct_relid = tag->sjct_relid; cachetag->sjct_base = tag->sjct_base; /* update block count file */ if (FileSeek(SJBlockVfd, 0L, L_SET) < 0) { elog(FATAL, "_sjregnblocks: cannot seek to zero on block count file"); } loc = 0; mytag.sjct_base = tag->sjct_base; /* overwrite existing entry, if any */ while (FileRead(SJBlockVfd, (char *) &mytag, sizeof(mytag)) > 0) { if (mytag.sjct_dbid == tag->sjct_dbid && mytag.sjct_relid == tag->sjct_relid) { if (FileSeek(SJBlockVfd, (loc * sizeof(SJCacheTag)), L_SET) < 0) elog(FATAL, "_sjregnblocks: cannot seek to loc"); if (FileWrite(SJBlockVfd, (char *) tag, sizeof(*tag)) < 0) elog(FATAL, "_sjregnblocks: cannot write nblocks"); return; } loc++; } /* new relation -- write at end of file */ if (FileWrite(SJBlockVfd, (char *) tag, sizeof(*tag)) < 0) elog(FATAL, "_sjregnblocks: cannot write nblocks for new reln"); } int sjcommit() { FileSync(SJMetaVfd); FileSync(SJCacheVfd); FileSync(SJBlockVfd); return (SM_SUCCESS); } int sjabort() { return (SM_SUCCESS); } /* * SJShmemSize() -- Declare amount of shared memory we require. * * The shared memory initialization code creates a block of shared * memory exactly big enough to hold all the structures it needs to. * This routine declares how much space the Sony jukebox cache will * use. 
*/ int SJShmemSize() { int size; int nbuckets; int nsegs; int tmp; /* size of cache metadata */ size = ((SJCACHESIZE + 1) * sizeof(SJCacheItem)) + sizeof(SJCacheHeader); #ifndef HAS_TEST_AND_SET size += sizeof(*SJNWaiting); #endif /* ndef HAS_TEST_AND_SET */ /* size of hash table */ nbuckets = 1 << (int)my_log2((SJCACHESIZE - 1) / DEF_FFACTOR + 1); nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1); size += my_log2(SJCACHESIZE) + sizeof(HHDR); size += nsegs * DEF_SEGSIZE * sizeof(SEGMENT); tmp = (int)ceil((double)SJCACHESIZE/BUCKET_ALLOC_INCR); size += tmp * BUCKET_ALLOC_INCR * (sizeof(BUCKET_INDEX) + sizeof(SJHashEntry)); /* nblock cache */ size += SJNBLKSIZE * sizeof(SJCacheTag); /* count shared memory required for jukebox state */ size += JBShmemSize(); return (size); } /* * sjmaxseg() -- Find highest segment number occupied by platter id plid * in the on-disk cache. * * This routine is called from _pgjb_findoffset(). On entry here, * we hold JBSpinLock, but not SJCacheLock. We do something a little * dangerous here; we trust the group descriptor metadata that is in * shared memory to reflect accurately the state of the actual cache * file. This isn't so bad; if there's an inconsistency, there are * exactly two possibilities: * * + There was a crash between metadata and cache update, * and we'll figure that out later; * * + Some other backend has IO_IN_PROG set on the group we * are examining, and we need to look at the group desc * on disk in order to find out if the group is on plid. * * The second case basically means that we wind up holding SJCacheLock * during a disk io, but that's a sufficiently rare event that we don't * care. I can't think of any cleaner way to do this, anyway. * * We return the address of the first block of the highest-numbered * extent that we have cached for plid. If we have none cached, we * return InvalidBlockNumber. 
*/ BlockNumber sjmaxseg(plid) ObjectId plid; { int i; long seekpos, loc; int nbytes; BlockNumber last; SJGroupDesc *group; /* XXX hold the lock for a walk of the entire cache */ SpinAcquire(SJCacheLock); last = InvalidBlockNumber; group = (SJGroupDesc *) &(SJCacheBuf[0]); /* * Walk backwards along the free list. If we ever hit an unallocated * block, we can stop searching. Otherwise, we'll hit the head of the * list when freeprev == -1. */ for (i = SJHeader->sjh_freetail; i != -1 && SJCache[i].sjc_oid != InvalidObjectId; i = SJCache[i].sjc_freeprev) { /* if IO_IN_PROG is set, we need to look at the group desc on disk */ if (SJCache[i].sjc_gflags & SJC_IOINPROG) { seekpos = i * SJBUFSIZE; if ((loc = FileSeek(SJCacheVfd, seekpos, L_SET)) != seekpos) { SpinRelease(SJCacheLock); elog(NOTICE, "sjmaxseg: cannot seek"); return (-1); } nbytes = FileRead(SJCacheVfd, (char *) group, sizeof(SJGroupDesc)); if (nbytes != sizeof(SJGroupDesc)) { SpinRelease(SJCacheLock); elog(NOTICE, "sjmaxseg: read of group desc %d failed", i); return (-1); } /* sanity checks */ if (group->sjgd_magic != SJGDMAGIC || group->sjgd_version != SJGDVERSION) { elog(FATAL, "sjmaxseg: cache file corrupt."); } if (group->sjgd_plid == plid) { if (group->sjgd_jboffset > last || last == InvalidBlockNumber) last = group->sjgd_jboffset; } } else { if (SJCache[i].sjc_plid == plid) { if (SJCache[i].sjc_jboffset > last || last == InvalidBlockNumber) { last = SJCache[i].sjc_jboffset; } } } } SpinRelease(SJCacheLock); return (last); } static void _sjdump() { int i, j; int nentries; SJCacheItem *item; SpinAcquire(SJCacheLock); nentries = SJHeader->sjh_nentries; printf("jukebox cache metdata: size %d, %d entries, free head %d tail %d", SJCACHESIZE, nentries, SJHeader->sjh_freehead, SJHeader->sjh_freetail); if (SJHeader->sjh_flags & SJH_INITING) printf(", INITING"); if (SJHeader->sjh_flags & SJH_INITED) printf(", INITED"); printf("\n"); for (i = 0; i < SJCACHESIZE; i++) { item = &SJCache[i]; printf(" [%2d] 
<%ld,%ld,%ld> %d@@%d next %d prev %d flags %s oid %ld\n", i, item->sjc_tag.sjct_dbid, item->sjc_tag.sjct_relid, item->sjc_tag.sjct_base, item->sjc_plid, item->sjc_jboffset, item->sjc_freenext, item->sjc_freeprev, (item->sjc_gflags & SJC_IOINPROG ? "IO_IN_PROG" : "CLEAR"), item->sjc_oid); printf(" "); for (j = 0; j < SJGRPSIZE; j++) { printf("[%d %c%c]", j, (item->sjc_flags[j] & SJC_MISSING ? 'm' : '-'), (item->sjc_flags[j] & SJC_ONPLATTER ? 'o' : '-')); } printf("\n"); } SpinRelease(SJCacheLock); } /* * SJInitSemaphore() -- Initialize the 'wait' semaphore for jukebox cache * pages. * * We only do this if we don't have test-and-set locks. */ SJInitSemaphore(key) IPCKey key; { #ifndef HAS_TEST_AND_SET int status; SJWaitSemId = IpcSemaphoreCreate(IPCKeyGetSJWaitSemaphoreKey(key), 1, IPCProtection, 0, &status); if (SJWaitSemId < 0) { elog(FATAL, "cannot create/attach jukebox semaphore"); } #else /* ndef HAS_TEST_AND_SET */ return; #endif /* ndef HAS_TEST_AND_SET */ } #endif /* SONY_JUKEBOX */ @ 1.34 log @blocks allocated to platters according to a more sensible policy @ text @d39 1 a39 1 RcsId("$Header: /private/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.33 1992/05/28 17:09:35 mao Exp mao $"); d757 1 a757 1 plmdata->plid = plattup->t_oid; @ 1.33 log @checkin to sync up for testing -- changed a comment describing our allocation strategy, but not the actual strategy. need to get back tot his. @ text @d39 1 a39 1 RcsId("$Header: /private/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.32 1992/01/29 21:32:50 mao Exp mao $"); d125 5 a129 3 extern HTAB *ShmemInitHash(); extern int *ShmemInitStruct(); extern Relation RelationIdGetRelation(); d696 2 a697 1 * platter <= 2/3 full. d703 4 a706 1 * allocate it there. d708 2 a709 1 * - Otherwise, allocate the extent on any platter <= 2/3 full. 
d727 2 d735 8 d745 5 a750 1 d754 22 a775 7 /* get platter OID, name */ plmdata->plid = plattup->t_oid; d = (Datum) heap_getattr(plattup, buf, Anum_pg_platter_plname, platdesc, &isnull); platname = DatumGetName(d); strncpy(plname, &(platname->data[0]), sizeof(NameData)); plname[sizeof(NameData)] = '\0'; a776 5 /* done */ ReleaseBuffer(buf); heap_endscan(platscan); heap_close(plat); d783 6 @ 1.32 log @pass address of spinlock, not spinlock itself @ text @d39 1 a39 1 RcsId("$Header: /n/hermes/usr5/postgres/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.31 1991/11/14 19:40:44 kemnitz Exp mao $"); d691 12 a702 2 * For now, this makes a really stupid choice. Need to think about * the right way to go about this. @ 1.31 log @protos checkin. @ text @d39 1 a39 1 RcsId("$Header: RCS/sj.c,v 1.30 91/11/08 20:18:35 mao Exp Locker: kemnitz $"); d89 1 a89 1 S_LOCK(item->sjc_iolock); @ 1.30 log @file mode on open is 0600 @ text @d39 1 a39 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.29 1991/11/07 06:05:53 mao Exp mao $"); a126 1 extern int tag_hash(); d326 1 a326 1 nread = FileRead(SJMetaVfd, SJCache, nbytes); d1520 1 a1520 1 entry = (SJHashEntry *) hash_search(SJCacheHT, tagP, op, foundP); d1646 1 a1646 1 if (FileRead(SJCacheVfd, &gdesc, sizeof(gdesc)) < 0) { d1823 1 a1823 1 while ((nbytes = FileRead(SJBlockVfd, &mytag, sizeof(mytag))) > 0) { d1888 1 a1888 1 while (FileRead(SJBlockVfd, &mytag, sizeof(mytag)) > 0) { @ 1.29 log @at initialization time, free list head points to first unallocated extent in the cache. also, walk backwards along free list when computing first unallocated extent on a platter. 
@ text @d39 1 a39 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.28 1991/10/29 06:34:27 mao Exp mao $"); d1555 1 a1555 1 fd = FileNameOpenFile(path, O_RDWR, 0666); @ 1.28 log @remove debugging code @ text @d39 1 a39 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.27 1991/10/29 06:33:22 mao Exp mao $"); d342 1 a342 6 /* * Add every group that appears in the cache to the hash table. Since * we have no references to any of these groups yet, they all appear on * the free list. */ a349 8 /* link up free list -- no info yet, so just link groups in order */ cur->sjc_freeprev = i - 1; if (i == SJCACHESIZE - 1) { cur->sjc_freenext = -1; } else { cur->sjc_freenext = i + 1; } d360 7 a366 2 * Put the rest of the cache entries on the free list, marking them as * missing by setting the oid entry to InvalidObjectId. d369 1 a369 1 for (i = nentries; i < SJCACHESIZE; i++) { a370 1 cur->sjc_oid = InvalidObjectId; d372 1 d378 39 a420 2 SJHeader->sjh_freehead = 0; SJHeader->sjh_freetail = SJCACHESIZE - 1; d2005 9 a2013 1 for (i = 0; i < SJHeader->sjh_nentries; i++) { @ 1.27 log @be polite -- allocate memory before you write on it. 
@ text @d39 1 a39 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.26 1991/10/29 04:12:35 mao Exp $"); a52 2 static SJCacheTag *DebugBlockEnd; /* pointer to nblock cache */ int *MaoDebugInt = 0x10072000; a194 1 DebugBlockEnd = (SJCacheTag *) cacheblk; a1805 1 if (cachetag >= DebugBlockEnd) _punt(); a1846 1 if (cachetag >= DebugBlockEnd) _punt(); a2083 5 _punt() { elog(NOTICE, "found it"); } @ 1.26 log @fix nblock cache code -- compilation failed @ text @d39 1 a39 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.25 1991/10/29 00:11:52 mao Exp $"); d53 2 d165 2 a166 1 metasize = (SJCACHESIZE * sizeof(SJCacheItem)) + sizeof(SJCacheHeader); d169 1 a169 1 + sizeof(*SJNWaiting); d197 1 d1809 1 d1835 1 a1835 1 cachetag = &(SJNBlockCache[0]); d1851 1 d2089 5 @ 1.25 log @fix up some botches in cache management @ text @d39 1 a39 1 RcsId("$Header: RCS/sj.c,v 1.24 91/10/04 17:52:59 mao Exp Locker: mao $"); d1782 2 a1783 2 if (cachetag->sjct_dbid == tag.sjct_dbid && cachetag->sjct_relid == tag.sjct_relid) { d1830 1 a1830 1 cachetag = SJNBlockCache; d1833 2 a1834 2 if (cachetag->sjct_dbid == tag.sjct_dbid && cachetag->sjct_relid == tag.sjct_relid) d1842 2 a1843 2 i = tag.sjct_relid % SJNBLKSIZE; cachetag = &(SJCacheTag[i]); d1846 3 a1848 3 cachetag->sjct_dbid = tag.sjct_dbid; cachetag->sjct_relid = tag.sjct_relid; cachetag->sjct_base = tag.sjct_base; @ 1.24 log @starting to optimize -- cut way back on the amount of io we do for reads and writes on the mag disk cache. @ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.23 1991/10/03 15:07:32 mao Exp mao $"); d52 1 d179 1 a179 1 * and cache entries. d190 3 d337 5 d1775 2 d1779 11 d1797 12 d1826 2 d1830 20 d1922 3 @ 1.23 log @on create, release the cache lock before returning. oops. 
@ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.22 1991/10/03 00:56:55 mao Exp $"); d115 1 d1212 1 d1269 16 a1284 1 if (_sjreadgrp(item, grpno) == SM_FAIL) { d1289 6 a1294 2 offset = (grpoffset * BLCKSZ) + JBBLOCKSZ; bcopy(buffer, &(SJCacheBuf[offset]), BLCKSZ); d1296 2 a1297 9 /* * It's the highest-numbered block in this relation, and it's not on * the platter yet. * * NOTE: by doing this, we've just changed the number of blocks in the * relation. We need to hold the extend lock on this reln until end * of transaction, since no one will be able to see the new block until * then. */ d1299 6 a1304 1 item->sjc_flags[grpoffset] |= SJC_CLEAR; d1306 2 a1307 2 /* finally, write out the extent with the new block in it */ if (_sjwritegrp(item, grpno) == SM_FAIL) { d1309 1 d1313 1 d1541 1 d1563 13 a1575 1 if (_sjreadgrp(item, grpno) == SM_FAIL) { d1580 6 a1585 2 offset = ((blocknum % SJGRPSIZE) * BLCKSZ) + JBBLOCKSZ; bcopy(&(SJCacheBuf[offset]), buffer, BLCKSZ); d1593 31 d1636 1 d1658 2 d1662 16 a1677 1 SET_IO_LOCK(item); d1679 2 a1680 1 if (_sjreadgrp(item, grpno) == SM_FAIL) { d1686 9 a1694 2 offset = (which * BLCKSZ) + JBBLOCKSZ; bcopy(buffer, &(SJCacheBuf[offset]), BLCKSZ); d1696 2 a1697 1 if (_sjwritegrp(item, grpno) == SM_FAIL) { @ 1.22 log @cleanup and bug fixes -- wisconsin benchmark now works for jukebox relations @ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.21 1991/09/28 20:04:03 mao Exp mao $"); d556 1 d559 3 a561 4 * By here, cache is initialized and we have exclusive access to * metadata. We are aggressively lazy, and will not allocate an * initial extent for this relation until it's actually used. We * just register an initial block count of zero. @ 1.21 log @checking in in order to sync up and get a new tree; this version fixes many bugs, but still contains a bunch of debugging code, and should not be shipped. 
@ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.20 1991/09/11 07:19:37 mao Exp mao $"); d132 6 a137 8 * in shared memory and open the cache on mag disk. If this code is * executed by the postmaster, we'll create (but not populate) the * cache memory. The first backend to run that touches the cache * initializes it. All other backends running simultaneously will * only wait for this initialization to complete if they need to get * data out of the cache. Otherwise, they'll return successfully * immediately after attaching the cache memory, and will let their * older sibling do all the work. d155 1 a155 1 * the d800 8 a807 1 /* if there are no writes to force to the jukebox, we're done */ d809 9 a817 5 if (!(item->sjc_gflags & SJC_DIRTY)) { for (i = 0; i < SJGRPSIZE; i++) { if (item->sjc_flags[i] & SJC_DIRTY) { dirty = true; break; a819 2 } else { dirty = true; d851 1 a851 1 if (item->sjc_flags[i] & SJC_DIRTY) { d859 1 a859 17 /* if necessary, put the highest block in the relation on mag disk */ if ((item->sjc_tag.sjct_base + SJGRPSIZE + 1) >= nblocks) { grpoffset = ((nblocks - 1) % SJGRPSIZE); if (item->sjc_flags[grpoffset] & SJC_DIRTY) { /* COMPLETELY bogus. Won't work with any sort of sharing. 
*/ reln = RelationIdGetRelation(item->sjc_tag.sjct_relid); loc = FileSeek(reln->rd_fd, 0L, L_SET); where = JBBLOCKSZ + ((nblocks - 1) * BLCKSZ); FileWrite(reln->rd_fd, &(SJCacheBuf[where]), BLCKSZ); item->sjc_flags[grpoffset] &= ~SJC_DIRTY; } } if (pgjb_wrtextent(item, &(SJCacheBuf[0])) == SM_FAIL) { a1056 15 /* XXX debug */ { int i; char *p; p = &(SJCacheBuf[JBBLOCKSZ]); for (i = 0; i < SJGRPSIZE; i++) { if (!(item->sjc_flags[i] & SJC_MISSING)) _sjbuftrap(item->sjc_tag.sjct_base + i, p); p += BLCKSZ; } } a1177 15 /* XXX debug */ { int i; char *p; p = &(SJCacheBuf[JBBLOCKSZ]); for (i = 0; i < SJGRPSIZE; i++) { if (!(item->sjc_flags[i] & SJC_MISSING)) _sjbuftrap(item->sjc_tag.sjct_base + i, p); p += BLCKSZ; } } a1253 5 * * The check of DIRTY and ONPLATTER in case of not MISSING is to handle * the case where some other backend started to do the extend, then * aborted. In fact, this is probably an error, and the code to handle * it may not work correctly; should think more about this. d1257 2 a1258 5 if (item->sjc_flags[grpoffset] & SJC_DIRTY || item->sjc_flags[grpoffset] & SJC_ONPLATTER) { SpinRelease(SJCacheLock); elog(WARN, "sjextend: cache botch: next block in group present"); } d1276 2 a1277 14 * It's the highest-numbered block in this relation, and it's dirty, * now. NOTE: by doing this, we've just changed the number of blocks * in the relation. We need to hold the extend lock on this reln * until end of transaction, since no one will be able to see the new * block until then. */ item->sjc_flags[grpoffset] |= SJC_DIRTY; /* * Since we just added a new block to the relation, the old highest- * numbered block is about to become a candidate for movement to the * optical disk jukebox. Until now, it's been cached on magnetic * disk. We need to mark it dirty. d1279 4 a1282 5 * There are two possibilities: if the old block is in the same * extent as the new block, then we can just mark it dirty directly, * since we have that group already. 
If this is a brand-new extent, * then we need to instantiate the extent that precedes it, and mark * the highest-numbered block in that extent dirty. d1285 1 a1285 24 if (grpoffset == 0) { /* * Hard case -- we just allocated a new extent. We need to * instantiate the previous extent and mark the block dirty * there. This is complicated enough to wrap up in a separate * routine. */ if (nblocks > 0) _sjdirtylast(tag.sjct_dbid, tag.sjct_relid, nblocks - 1); } else { /* * Easy case -- old block is in this extent. Decrement the * offset and mark the block dirty. It is bad news if the * old highest-numbered block is on a platter or missing; these * should never happen. */ grpoffset--; if (item->sjc_flags[grpoffset] & SJC_MISSING || item->sjc_flags[grpoffset] & SJC_ONPLATTER) { a1286 7 elog(WARN, "sjextend: old 'last block' not writable"); } /* okay, mark it dirty */ item->sjc_flags[grpoffset] |= SJC_DIRTY; } a1301 76 /* * _sjdirtyblock() -- Mark the requested block in a relation dirty. * * When we extend a relation, it gets a new last block. The last * block of every relation is always stored on magnetic disk, so * when we do an extend, we need to mark the old last block dirty. * This will guarantee that it gets kicked out to the optical * platter later, and that the new last block can be safely written * to the magnetic disk file for caching the relation's last block. 
*/ static void _sjdirtylast(dbid, relid, blkno) ObjectId dbid; ObjectId relid; int blkno; { OffsetNumber base; int grpno; int i; long seekpos; long loc; int nbytes; char *buf; int which; SJCacheItem *item; base = ((blkno / SJGRPSIZE) * SJGRPSIZE); which = (blkno % SJGRPSIZE); item = _sjfetchgrp(dbid, relid, base, &grpno); SpinAcquire(SJCacheLock); SET_IO_LOCK(item); /* mark it dirty */ if ((item->sjc_flags[which] & SJC_MISSING) || (item->sjc_flags[which] & SJC_ONPLATTER)) { _sjunwait_io(item); _sjunpin(item); elog(WARN, "_sjdirtyblock: old 'last block' not writable"); } item->sjc_flags[which] |= SJC_DIRTY; /* just need to update the metadata file */ seekpos = grpno * sizeof(*item); if ((loc = FileSeek(SJMetaVfd, seekpos, L_SET)) != seekpos) { _sjunwait_io(item); _sjunpin(item); elog(WARN, "_sjdirtyblock: cache metadata file seek failed"); } nbytes = sizeof(*item); buf = (char *) item; while (nbytes > 0) { i = FileWrite(SJMetaVfd, buf, nbytes); if (i < 0) { _sjunwait_io(item); _sjunpin(item); elog(WARN, "_sjdirtyblock: cache metadata file write failed"); } nbytes -= i; buf += i; } _sjunwait_io(item); _sjunpin(item); FileSync(SJMetaVfd); } a1341 15 /* XXX debug */ { int i; char *p; p = &(SJCacheBuf[JBBLOCKSZ]); for (i = 0; i < SJGRPSIZE; i++) { if (!(item->sjc_flags[i] & SJC_MISSING)) _sjbuftrap(item->sjc_tag.sjct_base + i, p); p += BLCKSZ; } } d1385 1 a1385 1 item->sjc_gflags = SJC_DIRTY; a1548 2 _sjbuftrap(blocknum, buffer); a1567 2 _sjbuftrap(blocknum, buffer); a1588 1 item->sjc_flags[which] |= SJC_DIRTY; d1647 1 d1656 1 a1656 1 _sjfindnblocks(&tag); d1679 1 a1679 2 tag->sjct_base = mytag.sjct_base; return; d1894 1 a1894 2 printf("[%d %c%c%c]", j, (item->sjc_flags[j] & SJC_DIRTY ? 
'd' : '-'), a1926 21 #include "storage/bufpage.h" _sjbuftrap(blkno, page) OffsetNumber blkno; Page page; { HeapTuple htup; if (PageIsEmpty(page) || ((PageHeader) page)->pd_lower == 0) return; htup = (HeapTuple) PageGetItem(page, PageGetItemId(page, 0)); if (ItemPointerGetBlockNumber(&(htup->t_ctid)) != blkno) _sjtrap(); } _sjtrap() { elog(NOTICE, "got that puppy"); } @ 1.20 log @flushes to platters sort of working; sometimes we get a small hole in an extent. need to try to figure out what is going on in pgjb_wrtextent, in the case where we parcel up the write into pieces. @ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.19 1991/09/10 23:27:19 mao Exp mao $"); d111 1 d850 2 d853 2 a854 2 if ((item->sjc_tag.sjct_base + SJGRPSIZE) >= nblocks) { grpoffset = (nblocks % SJGRPSIZE) - 1; d907 1 a907 1 return (_sjfetchgrp(dbid, relid, blkno)); d1066 15 d1202 4 a1205 9 nbytes = SJBUFSIZE; buf = &(SJCacheBuf[0]); while (nbytes > 0) { i = FileWrite(SJCacheVfd, buf, nbytes); if (i < 0) return (SM_FAIL); nbytes -= i; buf += i; } d1207 1 a1207 1 FileSync(SJCacheVfd); d1209 3 a1211 2 return (SM_SUCCESS); } d1213 2 a1214 16 static int _sjreadgrp(item, grpno) SJCacheItem *item; int grpno; { long seekpos; long loc; int nbytes, i; char *buf; SJGroupDesc *gdesc; /* get the group from the cache file */ seekpos = grpno * SJBUFSIZE; if ((loc = FileSeek(SJCacheVfd, seekpos, L_SET)) != seekpos) { elog(NOTICE, "_sjreadgrp: cannot seek"); return (SM_FAIL); d1220 2 a1221 3 i = FileRead(SJCacheVfd, buf, nbytes); if (i < 0) { elog(NOTICE, "_sjreadgrp: read failed"); a1222 1 } d1227 1 a1227 9 gdesc = (SJGroupDesc *) &(SJCacheBuf[0]); if (gdesc->sjgd_magic != SJGDMAGIC || gdesc->sjgd_version != SJGDVERSION || gdesc->sjgd_groupoid != item->sjc_oid) { elog(NOTICE, "_sjreadgrp: trashed cache"); return (SM_FAIL); } a1231 7 int sjunlink(reln) Relation reln; { return (SM_FAIL); } d1329 1 d1332 46 d1393 141 d1730 2 d1751 2 d2114 21 @ 1.19 log @get rid of some 
curiosities in the block and extent allocation code. we are now agressively lazy, and will not allocate even the initial extent for a relation until it's needed. this simplifies the code a lot. @ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.18 1991/09/10 06:41:50 mao Exp $"); d732 4 d737 2 a738 2 * If the cache is full, we call a routine to get rid of the least * recently used group. d741 1 a741 4 if (SJHeader->sjh_nentries == SJCACHESIZE) elog(FATAL, "_sjallocgrp: no groups on free list!"); else *grpno = _sjgetgrp(); a744 3 /* bump ref count */ _sjtouch(item); d759 13 d787 4 d832 17 a849 2 nblocks = _sjfindnblocks(&(item->sjc_tag)); d954 1 a954 1 /* read the extent */ d957 4 a960 1 /* release IO lock */ d1000 1 a1000 1 ScanKeyEntryInitialize(&skey[0], 0x0, Anum_pg_plmap_plblkno, d1234 1 d1249 4 d1419 1 a1419 1 group->sjgd_relblkno = 0; @ 1.18 log @work on cache management -- allocate extents properly @ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.17 1991/09/09 23:58:55 mao Exp mao $"); d532 3 d560 3 a562 1 * metadata. Allocate an initial (empty) extent in the cache. 
d565 9 a573 1 _sjnewextent(reln, 0); d972 8 d981 4 a984 4 _sjunwait_io(item); elog(WARN, "_sjrdextent: cannot find <%d,%d,%d>", item->sjc_tag.sjct_dbid, item->sjc_tag.sjct_relid, item->sjc_tag.sjct_base); d1130 1 a1226 1 int blkno; d1242 1 a1242 2 if (((nblocks + 1) % SJGRPSIZE) == 0) { base += SJGRPSIZE; d1262 1 d1265 1 a1278 1 grpoffset = nblocks % SJGRPSIZE; d1298 1 a1298 1 offset = ((blkno % SJGRPSIZE) * BLCKSZ) + JBBLOCKSZ; d1492 6 @ 1.17 log @use GetPGHome instead of using getenv directly @ text @d39 1 a39 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.16 1991/09/05 23:26:02 hong Exp mao $"); d1791 1 a1791 1 if (group->sjgd_jboffset > last) d1796 3 a1798 1 if (SJCache[i].sjc_jboffset > last) d1800 1 @ 1.16 log @fix a bug in shared memory size calculation @ text @a7 2 #include d13 1 d17 2 d39 1 a39 1 RcsId("$Header: RCS/sj.c,v 1.15 91/08/22 06:33:09 mao Exp Locker: mao $"); a124 1 extern char *getenv(); d231 1 a231 3 if ((pghome = getenv("POSTGRESHOME")) == (char *) NULL) pghome = "/usr/postgres"; @ 1.15 log @bug fixes to code that handles flushing, fetching bytes from the jukebox @ text @d15 1 d38 1 a38 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.14 1991/08/13 22:00:30 mao Exp $"); d1702 1 d1713 5 a1717 4 size += my_log2(SJCACHESIZE) + sizeof(HHDR) + nsegs * DEF_SEGSIZE * sizeof(SEGMENT) + (int)ceil((double)SJCACHESIZE/BUCKET_ALLOC_INCR)*BUCKET_ALLOC_INCR* (sizeof(BUCKET_INDEX) + sizeof(SJHashEntry)); @ 1.14 log @separate routine now initializes jukebox wait semaphore; this is to permit the postmaster to find this semaphore on shutdown @ text @d37 1 a37 1 RcsId("$Header: /local/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.13 1991/08/08 05:53:28 mao Exp $"); a49 1 static SJNBlock *SJNBlockList; /* linked list of nblocks by relid */ a281 3 /* haven't computed block counts for any relations yet */ SJNBlockList = (SJNBlock *) NULL; d382 1 a382 1 SJHeader->sjh_freetail = SJCACHESIZE; d694 1 a694 1 
bcopy(&(item->sjc_plname.data[0]), plname, sizeof(NameData)); d806 1 a806 1 nblocks = sjnblocks(&(item->sjc_tag)); d906 1 a906 1 bcopy((char *) &(item->sjc_tag), (char *) &tag, sizeof(tag)); a1213 8 if (reln->rd_rel->relisshared) tag.sjct_dbid = (ObjectId) 0; else tag.sjct_dbid = MyDatabaseId; tag.sjct_relid = reln->rd_id; tag.sjct_base = base; d1223 1 d1228 8 d1469 1 d1478 1 a1478 1 item = _sjfetchgrp(reldbid, reln->rd_id, blocknum / SJGRPSIZE, &grpno); d1480 2 d1509 1 d1519 3 a1521 1 item = _sjfetchgrp(reldbid, reln->rd_id, blocknum / SJGRPSIZE, &grpno); a1614 1 SJNBlock *l; a1617 11 /* see if we already computed the block count */ l = SJNBlockList; while (l != (SJNBlock *) NULL) { if (l->sjnb_relid == tag->sjct_relid && l->sjnb_dbid == tag->sjct_dbid) return (l->sjnb_nblocks); l = l->sjnb_next; } /* nope, need to do some work */ a1642 1 SJNBlock *l; a1644 23 l = SJNBlockList; /* overwrite old value, if one exists */ while (l != (SJNBlock *) NULL) { if (l->sjnb_relid == tag->sjct_relid && l->sjnb_dbid == tag->sjct_dbid) { l->sjnb_nblocks = (int) tag->sjct_base; break; } l = l->sjnb_next; } /* otherwise, allocate new slot and write new value */ if (l == (SJNBlock *) NULL) { l = (SJNBlock *) palloc(sizeof(SJNBlock)); l->sjnb_relid = tag->sjct_relid; l->sjnb_dbid = tag->sjct_dbid; l->sjnb_nblocks = (int) tag->sjct_base; l->sjnb_next = SJNBlockList; SJNBlockList = l; } d1659 1 a1659 1 if (FileWrite(SJBlockVfd, (char *) &mytag, sizeof(mytag)) < 0) d1667 1 a1667 4 mytag.sjct_dbid = tag->sjct_dbid; mytag.sjct_relid = tag->sjct_relid; if (FileWrite(SJBlockVfd, (char *) &mytag, sizeof(mytag)) < 0) d1673 3 a1675 2 /* XXX should free the list, but it's in the wrong mcxt */ SJNBlockList = (SJNBlock *) NULL; a1682 3 /* XXX should free the list, but it's in the wrong mcxt */ SJNBlockList = (SJNBlock *) NULL; @ 1.13 log @simple jukebox interactions work correctly. 
@ text @d37 1 a37 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.12 1991/08/06 08:09:21 mao Exp mao $"); a138 4 * * The 'key' argument is the IPC key used in this backend (or postmaster) * for initializing shared memory and semaphores. Since we need a * wait lock, we need this. d142 1 a142 2 sjinit(key) IPCKey key; a207 14 #ifndef HAS_TEST_AND_SET /* * Finally, we need the wait semaphore if this system does not support * test-and-set locks. */ SJWaitSemId = IpcSemaphoreCreate(IPCKeyGetSJWaitSemaphoreKey(key), 1, IPCProtection, 0, &status); if (SJWaitSemId < 0) { SpinRelease(SJCacheLock); return (SM_FAIL); } #endif /* ndef HAS_TEST_AND_SET */ d1883 23 @ 1.12 log @MyDatabaseName, MyDatabaseId are extern, not static @ text @d37 1 a37 1 RcsId("$Header: RCS/sj.c,v 1.11 91/08/06 01:41:44 mao Exp Locker: mao $"); d1719 3 @ 1.11 log @real jukebox support is in, but is untested (initialization still works) @ text @d37 1 a37 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.10 1991/08/03 00:29:18 mao Exp mao $"); d41 2 a42 2 static ObjectId MyDatabaseId; /* OID of database we have open */ static Name MyDatabaseName; /* name of database we have open */ @ 1.10 log @add (some) real jukebox calls @ text @d27 1 d37 1 a37 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.9 1991/07/29 16:52:28 mer Exp mao $"); d46 1 d71 2 a72 2 /* static buffer is for data transfer -- SJGRPSIZE blocks + descriptor block */ static char SJCacheBuf[(BLCKSZ * SJGRPSIZE) + JBBLOCKSZ]; d74 21 a94 2 /* used in sj.c, pgjb.c */ int SJBufSize = ((BLCKSZ * SJGRPSIZE) + JBBLOCKSZ); d96 2 d108 2 d116 2 d125 1 d286 16 d356 6 a361 1 /* add each entry to the hash table, and set up link pointers */ d364 1 a364 27 result = (SJHashEntry *) hash_search(SJCacheHT, &(cur->sjc_tag), HASH_ENTER, &found); /* * If the hash table is corrupted, or the entry is already in the * table, then we're in trouble and need to surrender. 
When we * release our initialization lock on the cache metadata, someone * else may come along later and try to reinitialize it. They'll * fail, too, since we leave things trashed here. Rather than try * to clean up, however, we assume that failing fast is the right * answer. Since this is catastrophic, other backends probably * *should* fail. */ if (result == (SJHashEntry *) NULL) { SJHeader->sjh_flags &= ~SJH_INITING; _sjunwait_init(); elog(FATAL, "sj cache hash table corrupted"); } if (found) { SJHeader->sjh_flags &= ~SJH_INITING; _sjunwait_init(); elog(FATAL, "duplicate group in sj cache file: <%d,%d,%d>", cur->sjc_tag.sjct_dbid, cur->sjc_tag.sjct_relid, cur->sjc_tag.sjct_base); } d369 7 a375 6 /* link up lru list -- no info yet, so just link groups in order */ cur->sjc_lruprev = i - 1; if (i == nentries - 1) cur->sjc_lrunext = -1; else cur->sjc_lrunext = i + 1; d386 4 a389 2 /* set up cache metadata header struct */ SJHeader->sjh_nentries = nentries; d391 10 a400 4 if (nentries > 0) SJHeader->sjh_lruhead = 0; else SJHeader->sjh_lruhead = -1; d402 4 a405 1 SJHeader->sjh_lrutail = nentries - 1; d446 1 a446 1 * sjunwait_io() -- Release IO lock on the jukebox cache. d455 2 a456 2 void sjunwait_io(item) a555 1 SJHashEntry *entry; a557 1 bool found; d581 1 a581 1 * metadata. Allocate a group in the cache. 
d584 1 a584 1 item = _sjallocgrp(&grpno); d586 1 a587 89 item->sjc_tag.sjct_dbid = (ObjectId) 0; else item->sjc_tag.sjct_dbid = MyDatabaseId; item->sjc_tag.sjct_relid = (ObjectId) reln->rd_id; item->sjc_tag.sjct_base = (BlockNumber) 0; entry = (SJHashEntry *) hash_search(SJCacheHT, item, HASH_ENTER, &found); if (entry == (SJHashEntry *) NULL) { SpinRelease(SJCacheLock); elog(FATAL, "jukebox cache hash table corrupt."); } else if (found) { SpinRelease(SJCacheLock); elog(FATAL, "Attempt to create existing relation -- impossible"); } entry->sjhe_groupno = grpno; item->sjc_gflags |= SJC_IOINPROG; #ifdef HAS_TEST_AND_SET SpinRelease(SJCacheLock); S_LOCK(item->sjc_iolock); #else /* HAS_TEST_AND_SET */ (*SJNWaiting)++; SpinRelease(SJCacheLock); IpcSemaphoreLock(SJWaitSemId, 0, 1); #endif /* HAS_TEST_AND_SET */ /* set flags on item, initialize group descriptor block */ item->sjc_gflags = SJC_DIRTY; for (i = 0; i < SJGRPSIZE; i++) item->sjc_flags[i] = SJC_MISSING; /* should be smarter and only bzero what we need to */ bzero(SJCacheBuf, SJBufSize); group = (SJGroupDesc *) (&SJCacheBuf[0]); group->sjgd_magic = SJGDMAGIC; group->sjgd_version = SJGDVERSION; if (reln->rd_rel->relisshared) { group->sjgd_dbid = (ObjectId) 0; } else { strncpy(&(group->sjgd_dbname.data[0]), &(MyDatabaseName->data[0]), sizeof(NameData)); group->sjgd_dbid = (ObjectId) MyDatabaseId; } strncpy(&(group->sjgd_relname.data[0]), &(reln->rd_rel->relname.data[0]), sizeof(NameData)); group->sjgd_relid = reln->rd_id; group->sjgd_relblkno = 0; item->sjc_oid = group->sjgd_groupoid = newoid(); /* * Record the presence of the new extent in the system catalogs. The * plid, jboffset, and extentsz fields are filled in by _sjregister() * or the routines that it calls. Note that we do not force the new * group descriptor block all the way to the optical platter here. 
* We do decide where to place it, however, and must go to a fair amount * of trouble elsewhere in the code to avoid allocating the same extent * to a different relation, or block within the same relation. */ _sjregister(item, group); /* * Write the new group cache entry to disk. Sjwritegrp() knows where * the cache buffer begins, and forces out the group descriptor we * just set up. */ if (_sjwritegrp(item, grpno) == SM_FAIL) { sjunwait_io(item); return (-1); } _sjregnblocks(reln->rd_id, 0); /* can now release i/o lock on the item we just added */ sjunwait_io(item); /* no longer need the reference */ _sjunpin(item); /* last thing to do is to create the mag-disk file to hold last page */ if (group->sjgd_dbid == (ObjectId) 0) d734 1 a734 1 * group we're kicking out, if indeed we're doing that. d743 63 a805 4 /* see if we can avoid doing any work here */ if (SJHeader->sjh_nentries < SJCACHESIZE) { *grpno = SJHeader->sjh_nentries; SJHeader->sjh_nentries++; d807 19 a825 2 /* XXX here, need to kick someone out */ elog(FATAL, "hey mao, your cache appears to be full."); d828 17 a844 1 item = &SJCache[*grpno]; d846 3 a848 8 item->sjc_lruprev = -1; item->sjc_lrunext = SJHeader->sjh_lruhead; if (SJHeader->sjh_lruhead == -1) { SJHeader->sjh_lruhead = *grpno; SJHeader->sjh_lrutail = *grpno; } else { SJCache[SJHeader->sjh_lruhead].sjc_lruprev = *grpno; SJHeader->sjh_lruhead = *grpno; d851 4 a854 2 /* bump ref count */ _sjtouch(item); d856 1 a856 1 return (item); d877 1 a877 6 entry = (SJHashEntry *) hash_search(SJCacheHT, &tag, HASH_FIND, &found); if (entry == (SJHashEntry *) NULL) { SpinRelease(SJCacheLock); elog(FATAL, "_sjfetchgrp: hash table corrupted"); } d888 1 a888 1 _sjtouch(item, *grpno); d892 49 a940 3 SpinRelease(SJCacheLock); elog(FATAL, "_sjfetchgrp: hey mao: can't find <%d,%d,%d>", dbid, relid, blkno); d946 97 d1044 1 a1044 1 _sjtouch(item, grpno) a1045 1 int grpno; d1047 6 a1052 2 /* first bump the ref count */ (item->sjc_refcount)++; d1054 5 a1058 3 /* now move 
it to the top of the lru list */ if (item->sjc_lruprev == -1) return; d1060 5 a1064 4 if (item->sjc_lrunext == -1) SJHeader->sjh_lrutail = item->sjc_lruprev; else SJCache[item->sjc_lrunext].sjc_lruprev = item->sjc_lruprev; d1066 2 a1067 1 SJCache[item->sjc_lruprev].sjc_lrunext = item->sjc_lrunext; d1069 3 a1071 3 item->sjc_lruprev = -1; item->sjc_lrunext = SJHeader->sjh_lruhead; SJHeader->sjh_lruhead = grpno; d1074 9 d1087 3 d1091 2 d1095 23 a1117 1 (item->sjc_refcount)--; d1149 1 a1149 1 seekpos = grpno * SJBufSize; d1153 1 a1153 1 nbytes = SJBufSize; d1180 1 a1180 1 seekpos = grpno * SJBufSize; d1186 1 a1186 1 nbytes = SJBufSize; d1231 1 d1235 1 a1235 1 base = nblocks / SJGRPSIZE; d1247 5 a1251 1 entry = (SJHashEntry *) hash_search(SJCacheHT, &tag, HASH_FIND, &found); d1253 3 a1255 3 if (entry == (SJHashEntry *) NULL) { SpinRelease(SJCacheLock); elog(FATAL, "sjextend: cache hash table corrupted"); d1258 2 d1268 11 a1278 1 _sjtouch(item, grpno); d1280 6 a1285 5 for (blkno = 0; blkno < SJGRPSIZE; blkno++) { if (item->sjc_flags[blkno] & SJC_MISSING) { item->sjc_flags[blkno] &= ~SJC_MISSING; item->sjc_flags[blkno] |= SJC_DIRTY; break; d1287 2 d1291 1 a1291 6 if (blkno == SJGRPSIZE) { SpinRelease(SJCacheLock); elog(WARN, "sjextend: hey mao: no missing blocks to extend"); } item->sjc_gflags |= SJC_IOINPROG; d1293 1 a1293 8 #ifdef HAS_TEST_AND_SET SpinRelease(SJCacheLock); S_LOCK(item->sjc_iolock); #else /* HAS_TEST_AND_SET */ (*SJNWaiting)++; SpinRelease(SJCacheLock); IpcSemaphoreLock(SJWaitSemId, 0, 1); #endif /* HAS_TEST_AND_SET */ d1296 1 a1296 1 sjunwait_io(item); d1303 9 d1313 1 a1313 1 sjunwait_io(item); d1317 1 a1317 1 sjunwait_io(item); d1320 2 a1321 1 _sjregnblocks(reln->rd_id, ++nblocks); d1326 133 d1504 1 a1504 10 item->sjc_gflags |= SJC_IOINPROG; #ifdef HAS_TEST_AND_SET SpinRelease(SJCacheLock); S_LOCK(item->sjc_iolock); #else /* HAS_TEST_AND_SET */ (*SJNWaiting)++; SpinRelease(SJCacheLock); IpcSemaphoreLock(SJWaitSemId, 0, 1); #endif /* HAS_TEST_AND_SET */ 
d1507 1 a1507 1 sjunwait_io(item); d1514 1 a1514 1 sjunwait_io(item); a1551 1 item->sjc_gflags |= SJC_IOINPROG; d1553 1 a1553 8 #ifdef HAS_TEST_AND_SET SpinRelease(SJCacheLock); S_LOCK(item->sjc_iolock); #else /* HAS_TEST_AND_SET */ (*SJNWaiting)++; SpinRelease(SJCacheLock); IpcSemaphoreLock(SJWaitSemId, 0, 1); #endif /* HAS_TEST_AND_SET */ d1556 1 a1556 1 sjunwait_io(item); d1565 1 a1565 1 sjunwait_io(item); d1570 1 a1570 1 sjunwait_io(item); d1600 3 a1602 2 * This is an unbelievably expensive operation. We should cache this * number in shared memory once we compute it. d1609 1 a1609 18 Relation plmap; TupleDescriptor plmdesc; HeapScanDesc plmscan; HeapTuple plmtup; Buffer buf; ObjectId reldbid; Datum d; Boolean n; int32 v; int32 maxblkno; int i; int grpno; SJCacheItem *item; ScanKeyEntryData plmkey[2]; /* see if we've already figured this out */ if ((maxblkno = _sjfindnblocks(reln->rd_id)) >= 0) return (maxblkno); d1612 1 a1612 1 reldbid = (ObjectId) 0; d1614 1 a1614 1 reldbid = MyDatabaseId; d1616 1 a1616 45 ScanKeyEntryInitialize(&plmkey[0], 0x0, Anum_pg_plmap_pldbid, ObjectIdEqualRegProcedure, ObjectIdGetDatum(reldbid)); ScanKeyEntryInitialize(&plmkey[1], 0x0, Anum_pg_plmap_plrelid, ObjectIdEqualRegProcedure, ObjectIdGetDatum(reln->rd_id)); plmap = heap_openr(Name_pg_plmap); plmdesc = RelationGetTupleDescriptor(plmap); plmscan = heap_beginscan(plmap, false, NowTimeQual, 2, &plmkey[0]); maxblkno = 0; /* * Find the highest-numbered group in the relation by scanning * pg_plmap. */ while (HeapTupleIsValid(plmtup = heap_getnext(plmscan, false, (Buffer *) NULL))) { d = (Datum) heap_getattr(plmtup, InvalidBuffer, Anum_pg_plmap_plblkno, plmdesc, &n); v = DatumGetInt32(d); if (v > maxblkno) maxblkno = v; } heap_endscan(plmscan); heap_close(plmap); /* * Get the highest-numbered group, and count the number of blocks * that are actually present in the group. 
*/ item = _sjfetchgrp(reldbid, reln->rd_id, maxblkno, &grpno); for (i = 0; i < SJGRPSIZE; i++) { if (item->sjc_flags[i] & SJC_MISSING) break; } /* don't need the reference anymore */ _sjunpin(item); d1618 1 a1618 3 /* adjust the count of blocks and remember it for next time */ maxblkno += i; _sjregnblocks(reln->rd_id, maxblkno); d1620 1 a1620 1 return(maxblkno); d1624 1 a1624 3 * _sjfindnblocks() -- Find a precomputed block count for the given relid. * * We should really do something smarter here. d1628 2 a1629 2 _sjfindnblocks(relid) ObjectId relid; d1632 2 d1635 1 d1639 1 a1639 1 if (l->sjnb_relid == relid) d1645 15 a1659 1 return (-1); a1663 2 * * Should really do something smarter here. d1667 2 a1668 3 _sjregnblocks(relid, nblocks) ObjectId relid; int nblocks; d1670 1 d1672 1 d1679 4 a1682 3 if (l->sjnb_relid == relid) { l->sjnb_nblocks = nblocks; return; a1683 1 d1688 33 a1720 5 l = (SJNBlock *) palloc(sizeof(SJNBlock)); l->sjnb_relid = relid; l->sjnb_nblocks = nblocks; l->sjnb_next = SJNBlockList; SJNBlockList = l; d1823 1 a1823 1 seekpos = i * SJBufSize; d1871 3 a1873 3 printf("jukebox cache metdata: size %d, %d entries, lru head %d tail %d", SJCACHESIZE, nentries, SJHeader->sjh_lruhead, SJHeader->sjh_lrutail); d1880 1 a1880 1 for (i = 0; i < nentries; i++) { d1885 1 a1885 1 item->sjc_lrunext, item->sjc_lruprev, @ 1.9 log @hash table operators should not have same names as grammar tokens @ text @d34 1 a34 1 RcsId("$Header: RCS/sj.c,v 1.8 91/07/26 00:52:21 mao Exp Locker: mer $"); d36 1 a36 140 /* * When the buffer pool requests a particular page, we load a group of * pages from the jukebox into the mag disk cache for efficiency. * SJCACHESIZE is the number of these groups in the disk cache. Every * group is represented by one entry in the shared memory cache. SJGRPSIZE * is the number of 8k pages in a group. 
*/ #define SJCACHESIZE 64 /* # groups in mag disk cache */ #define SJGRPSIZE 10 /* # 8k pages in a group */ #define SJPATHLEN 64 /* size of path to cache file */ /* misc constants */ #define SJCACHENAME "_sj_cache_" /* relative to $POSTGRESHOME/data */ #define SJMETANAME "_sj_meta_" /* relative to $POSTGRESHOME/data */ /* bogus macros */ #define RelationSetLockForExtend(r) /* * SJGroupDesc -- Descriptor block for a cache group. * * The first 1024 bytes in a group -- on a platter or in the magnetic * disk cache -- are a descriptor block. We choose 1024 bytes because * this is the native block size of the jukebox. * * This block includes a description of the data that appears in the * group, including relid, dbid, relname, dbname, and a unique OID * that we use to verify cache consistency on startup. SJGroupDesc * is the structure that contains this information. It resides at the * start of the 1024-byte block; the rest of the block is unused. */ typedef struct SJGroupDesc { long sjgd_magic; long sjgd_version; NameData sjgd_dbname; NameData sjgd_relname; ObjectId sjgd_dbid; ObjectId sjgd_relid; long sjgd_relblkno; long sjgd_jboffset; long sjgd_extentsz; ObjectId sjgd_groupoid; } SJGroupDesc; #define SJGDMAGIC 0x060362 #define SJGDVERSION 0 #define JBBLOCKSZ 1024 /* * SJCacheTag -- Unique identifier for individual groups in the magnetic * disk cache. * * We use this identifier to query the shared memory cache metadata * when we want to find a particular group. */ typedef struct SJCacheTag { ObjectId sjct_dbid; /* database OID of this group */ ObjectId sjct_relid; /* relation OID of this group */ BlockNumber sjct_base; /* number of first block in group */ } SJCacheTag; /* * SJHashEntry -- The hash table code returns a pointer to a structure * that has this layout. 
*/ typedef struct SJHashEntry { SJCacheTag sjhe_tag; /* cache tag -- hash key */ int sjhe_groupno; /* which group this is in cache file */ } SJHashEntry; /* * SJCacheHeader -- Header data for in-memory metadata cache. */ typedef struct SJCacheHeader { int sjh_nentries; int sjh_lruhead; int sjh_lrutail; uint32 sjh_flags; #define SJH_INITING (1 << 0) #define SJH_INITED (1 << 1) #ifdef HAS_TEST_AND_SET slock_t sjh_initlock; /* initialization in progress lock */ #endif /* HAS_TEST_AND_SET */ } SJCacheHeader; /* * SJCacheItem -- Cache item describing blocks on the magnetic disk cache. * * An array of these is maintained in shared memory, with one entry * for every group that appears in the magnetic disk block cache. We * maintain a consistent copy of this array on magnetic disk whenever * we change the cache contents. This is because the magnetic disk * cache is persistent, and contains data that logically appears on the * jukebox between backend instances. * * The OID that appears in this structure is used to detect corruption * of the cache due to crashes during cache metadata update on disk. * When we detect corruption, we recover by marking the group free. We * are very careful to do this in a way that guarantees no data is lost, * and that does not require log processing. * * Since we never return pointers to private data, we don't need to * maintain a free list or pin count on magnetic disk cache groups. * In shared memory, we maintain a list of groups in LRU order (offsets * from the start of cache metadata are stored in this structure). * When we need a group for data transfer, we use the least-recently-used * group's space, kicking it out to the platter if necessary. * * Groups on the jukebox include one page (the first) that describes the * group, including its dbid, relid, dbname, relname, and extent size. * This page also includes the OID described above. 
*/ typedef struct SJCacheItem { SJCacheTag sjc_tag; /* dbid, relid, group triple */ int sjc_lruprev; /* LRU list pointer */ int sjc_lrunext; /* LRU list pointer */ int sjc_refcount; /* number of active refs */ ObjectId sjc_oid; /* OID of group */ uint8 sjc_gflags; /* flags for entire group */ #define SJG_CLEAR (uint8) 0x0 #define SJG_IOINPROG (1 << 0) uint8 sjc_flags[SJGRPSIZE]; /* flag bytes, 1 per block */ #define SJC_DIRTY (1 << 0) #define SJC_MISSING (1 << 1) #define SJC_ONPLATTER (1 << 2) a37 23 #ifdef HAS_TEST_AND_SET slock_t sjc_iolock; /* transfer in progress */ #endif /* HAS_TEST_AND_SET */ } SJCacheItem; /* * SJNBlock -- Linked list of count of blocks in relations. * * Computing a block count is so expensive that we cache the count * in local space when we've done the work. This is really a stupid * way to do it -- we'd rather do it in shared memory and have the * computed count survive transactions -- but this will work for now. */ typedef struct SJNBlock { ObjectId sjnb_relid; int sjnb_nblocks; struct SJNBlock *sjnb_next; } SJNBlock; d40 2 a41 3 extern bool IsPostmaster; /* is this the postmaster running? 
*/ extern ObjectId MyDatabaseId; /* OID of database we have open */ extern Name MyDatabaseName; /* name of database we have open */ a70 1 static int SJBufSize = ((BLCKSZ * SJGRPSIZE) + JBBLOCKSZ); d72 2 a73 14 /* routines declared here */ extern void sjcacheinit(); extern void sjwait_init(); extern void sjunwait_init(); extern void sjwait_io(); extern void sjunwait_io(); extern void sjtouch(); extern void sjunpin(); extern void sjregister(); extern void sjregnblocks(); extern int sjfindnblocks(); extern ObjectId sjchoose(); extern SJCacheItem *sjallocgrp(); extern SJCacheItem *sjfetchgrp(); d75 18 a198 9 if (IsPostmaster) { if (metafound) elog(FATAL, "sj cache found in shared memory by postmaster!"); bzero((char *) cachesave, metasize); return (SM_SUCCESS); } d235 1 a235 1 sjunwait_init(); d251 1 a251 1 sjunwait_init(); d262 4 a265 4 * Finally, if it's our responsibility to initialize the shared-memory * cache metadata, then go do that. sjcacheinit() will elog(FATAL, ...) * if it can't initialize the cache, so we don't need to worry about * a return value here. d269 1 a269 1 sjcacheinit(); d272 8 d283 2 a284 2 void sjcacheinit() d308 1 a308 1 sjunwait_init(); d331 1 a331 1 sjunwait_init(); d337 1 a337 1 sjunwait_init(); d353 2 a354 2 /* not waiting on I/O or anything, no active references to this guy */ cur->sjc_gflags = SJG_CLEAR; d375 1 a375 1 * sjunwait_init() -- Release initialization lock on the jukebox cache. d384 1 a384 1 * finish, we call sjunwait_init() to release the initialization lock d392 2 a393 2 void sjunwait_init() d424 1 a424 1 item->sjc_gflags &= ~SJG_IOINPROG; d443 1 a443 1 * sjwait_init() -- Wait for cache initialization to complete. d451 2 a452 2 void sjwait_init() d466 1 a466 1 * sjwait_io() -- Wait for group IO to complete. 
d475 2 a476 2 void sjwait_io(item) d539 1 a539 1 sjwait_init(); d551 1 a551 1 item = sjallocgrp(&grpno); d572 1 a572 1 item->sjc_gflags = SJG_IOINPROG; d583 1 a607 2 group->sjgd_jboffset = -1; group->sjgd_extentsz = (SJBufSize / JBBLOCKSZ); d610 19 a628 1 if (sjwritegrp(item, grpno) == SM_FAIL) { d633 1 a633 3 /* record presence of new extent in system catalogs */ sjregister(item, group->sjgd_jboffset, group->sjgd_extentsz); sjregnblocks(reln->rd_id, 0); d639 1 a639 1 sjunpin(item); d655 1 a655 1 * sjregister() -- Make catalog entry for a new extent d666 2 a667 2 void sjregister(item, jboffset, extentsz) d669 1 a669 2 int32 jboffset; int32 extentsz; d676 9 a684 4 plmap = heap_openr(Name_pg_plmap); RelationSetLockForWrite(plmap); plmdata = (Form_pg_plmap) palloc(sizeof(FormData_pg_plmap)); d686 1 a686 2 /* choose a platter to put the new extent on */ plmdata->plid = sjchoose(item); d688 4 a691 6 /* init the rest of the fields */ plmdata->pldbid = item->sjc_tag.sjct_dbid; plmdata->plrelid = item->sjc_tag.sjct_relid; plmdata->plblkno = item->sjc_tag.sjct_base; plmdata->ploffset = jboffset; plmdata->plextentsz = extentsz; d701 4 d708 4 d713 1 a713 1 heap_close(plmap); d717 1 a717 1 * sjchoose() -- Choose a platter to receive a new extent. 
d723 2 a724 2 ObjectId sjchoose(item) d728 1 d732 1 d734 8 d744 1 d749 9 a757 1 elog(WARN, "sjchoose: no platters in pg_plmap"); d759 1 a759 1 plid = plattup->t_oid; a760 1 d764 13 a776 1 return (plid); d780 1 a780 1 * sjallocgrp() -- Allocate a new group in the cache for use by some d792 2 a793 2 SJCacheItem * sjallocgrp(grpno) d820 1 a820 1 sjtouch(item); d825 2 a826 2 SJCacheItem * sjfetchgrp(dbid, relid, blkno, grpno) d847 1 a847 1 elog(FATAL, "sjfetchgrp: hash table corrupted"); d854 3 a856 3 if (item->sjc_gflags & SJG_IOINPROG) { sjwait_io(item); return (sjfetchgrp(dbid, relid, blkno)); d859 1 a859 1 sjtouch(item, *grpno); d864 1 a864 1 elog(FATAL, "sjfetchgrp: hey mao: can't find <%d,%d,%d>", d871 2 a872 2 void sjtouch(item, grpno) d895 2 a896 2 void sjunpin(item) d901 1 a901 1 elog(FATAL, "sjunpin: illegal reference count"); d906 2 a907 2 int sjwritegrp(item, grpno) d953 2 a954 2 int sjreadgrp(item, grpno) d967 1 a967 1 elog(NOTICE, "sjreadgrp: cannot seek"); d976 1 a976 1 elog(NOTICE, "sjreadgrp: read failed"); d988 1 a988 1 elog(NOTICE, "sjreadgrp: trashed cache"); d1046 1 a1046 1 sjtouch(item, grpno); d1061 1 a1061 1 item->sjc_gflags = SJG_IOINPROG; d1072 1 a1072 1 if (sjreadgrp(item, grpno) == SM_FAIL) { d1080 1 a1080 1 if (sjwritegrp(item, grpno) == SM_FAIL) { d1086 1 a1086 1 sjunpin(item); d1088 1 a1088 1 sjregnblocks(reln->rd_id, ++nblocks); d1133 1 a1133 1 item = sjfetchgrp(reldbid, reln->rd_id, blocknum / SJGRPSIZE, &grpno); d1135 1 a1135 1 /* shd expand sjfetchgrp() inline to avoid extra semop()s */ d1138 1 a1138 1 item->sjc_gflags = SJG_IOINPROG; d1149 1 a1149 1 if (sjreadgrp(item, grpno) == SM_FAIL) { d1158 1 a1158 1 sjunpin(item); d1180 1 a1180 1 item = sjfetchgrp(reldbid, reln->rd_id, blocknum / SJGRPSIZE, &grpno); d1182 1 a1182 1 /* shd expand sjfetchgrp() inline to avoid extra semop()s */ d1189 1 a1189 1 sjunpin(item); d1195 1 a1195 1 item->sjc_gflags = SJG_IOINPROG; d1206 1 a1206 1 if (sjreadgrp(item, grpno) == SM_FAIL) { d1208 1 a1208 1 
sjunpin(item); d1215 1 a1215 1 if (sjwritegrp(item, grpno) == SM_FAIL) { d1217 1 a1217 1 sjunpin(item); d1222 1 a1222 1 sjunpin(item); d1275 1 a1275 1 if ((maxblkno = sjfindnblocks(reln->rd_id)) >= 0) d1319 1 a1319 1 item = sjfetchgrp(reldbid, reln->rd_id, maxblkno, &grpno); d1327 1 a1327 1 sjunpin(item); d1331 1 a1331 1 sjregnblocks(reln->rd_id, maxblkno); d1337 1 a1337 1 * sjfindnblocks() -- Find a precomputed block count for the given relid. d1342 2 a1343 2 int sjfindnblocks(relid) d1361 1 a1361 1 * sjregnblocks() -- Remember the count of blocks for this relid. d1366 2 a1367 2 void sjregnblocks(relid, nblocks) d1434 2 d1441 3 d1447 85 d1553 1 a1553 1 printf(" [%2d] <%ld,%ld,%ld> next %d prev %d flags %s oid %ld\n", d1555 3 a1557 3 item->sjc_tag.sjct_base, item->sjc_lrunext, item->sjc_lruprev, (item->sjc_gflags & SJG_IOINPROG ? "IO_IN_PROG" : "CLEAR"), @ 1.8 log @bug fix -- was passing a structure instead of a pointer. you've got to be careful what you dereference, these days. @ text @d34 1 a34 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.7 1991/07/24 23:37:03 mao Exp mao $"); d475 1 a475 1 ENTER, &found); d720 1 a720 1 entry = (SJHashEntry *) hash_search(SJCacheHT, item, ENTER, &found); d948 1 a948 1 entry = (SJHashEntry *) hash_search(SJCacheHT, &tag, FIND, &found); d1136 1 a1136 1 entry = (SJHashEntry *) hash_search(SJCacheHT, &tag, FIND, &found); @ 1.7 log @clean up conditional compilation, fix bug in initialization code for platforms with test and set locks @ text @d34 1 a34 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.6 1991/07/24 07:47:24 mao Exp mao $"); d1206 1 a1206 1 path = relpath(reln->rd_rel->relname); @ 1.6 log @fix size computations, add main memory storage manager @ text @d34 1 a34 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.5 1991/07/22 22:21:11 mao Exp mao $"); d375 1 a380 1 SJHeader->sjh_flags = SJH_INITING; a1584 117 } #else /* SONY_JUKEBOX */ #include "machine.h" #include 
"storage/smgr.h" #include "utils/rel.h" /* * If there's no sony jukebox, we just use stub routines. */ int sjinit(unused) int unused; { return (SM_SUCCESS); } int sjshutdown() { return (SM_SUCCESS); } int sjcreate(reln) Relation reln; { return (-1); } int sjunlink(reln) Relation reln; { return (SM_FAIL); } int sjextend(reln, buffer) Relation reln; char *buffer; { return (SM_FAIL); } int sjopen(reln) Relation reln; { return (-1); } int sjclose(reln) Relation reln; { return (SM_FAIL); } int sjread(reln, blocknum, buffer) Relation reln; BlockNumber blocknum; char *buffer; { return (SM_FAIL); } int sjwrite(reln, blocknum, buffer) Relation reln; BlockNumber blocknum; char *buffer; { return (SM_FAIL); } int sjflush(reln, blocknum, buffer) Relation reln; BlockNumber blocknum; char *buffer; { return (SM_FAIL); } int sjblindwrt(dbstr, relstr, dbid, relid, blkno, buffer) char *dbstr; char *relstr; OID dbid; OID relid; BlockNumber blkno; char *buffer; { return (SM_FAIL); } int sjnblocks(reln) Relation reln; { return (-1); } int sjcommit() { return (SM_SUCCESS); } int sjabort() { return (SM_SUCCESS); @ 1.5 log @jukebox storage manager installation @ text @d34 1 a34 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.4 1991/07/22 08:00:36 mao Exp mao $"); a1531 8 nbuckets = 1 << my_log2((SJCACHESIZE - 1) / DEF_FFACTOR + 1); nsegs = 1 << my_log2((nbuckets - 1) / DEF_SEGSIZE + 1); /* size of shared memory binding table */ size = my_log2(BTABLE_SIZE) + sizeof(HHDR) + DEF_SEGSIZE * sizeof(SEGMENT) + BUCKET_ALLOC_INCR * (sizeof(BUCKET_INDEX) + BTABLE_KEYSIZE + BTABLE_DATASIZE); d1533 1 a1533 1 size += ((SJCACHESIZE + 1) * sizeof(SJCacheItem)) + sizeof(SJCacheHeader); @ 1.4 log @added code for just about everything, still pretty buggy @ text @d34 1 a34 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.3 1991/07/22 05:32:38 mao Exp mao $"); d840 5 a1068 11 long sjgd_magic; long sjgd_version; NameData sjgd_dbname; NameData sjgd_relname; ObjectId 
sjgd_dbid; ObjectId sjgd_relid; long sjgd_relblkno; long sjgd_jboffset; long sjgd_extentsz; ObjectId sjgd_groupoid; d1407 3 a1409 2 while (HeapTupleIsValid(plmtup = heap_getnext(plmscan, false, &buf))) { d = (Datum) heap_getattr(plmtup, buf, Anum_pg_plmap_plblkno, a1410 1 ReleaseBuffer(buf); d1501 1 a1501 8 SJNBlock *l; while (SJNBlockList != (SJNBlock *) NULL) { l = SJNBlockList; SJNBlockList = SJNBlockList->sjnb_next; pfree(l); } d1510 1 a1510 8 SJNBlock *l; while (SJNBlockList != (SJNBlock *) NULL) { l = SJNBlockList; SJNBlockList = SJNBlockList->sjnb_next; pfree(l); } @ 1.3 log @more stuff working -- create updates catalogs, etc @ text @d34 1 a34 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.2 1991/07/21 23:13:32 mao Exp mao $"); d52 3 d583 2 d733 1 d737 1 a740 3 /* okay, cache and group are set up -- safe to release excl lock now */ SpinRelease(SJCacheLock); d777 1 d787 1 a787 1 sprintf(path, "../%16s", &(reln->rd_rel->relname.data[0])); d789 1 a789 1 sprintf(path, "%16s", &(reln->rd_rel->relname.data[0])); d791 2 d926 1 a926 1 sjfetchgrp(dbid, relid, blkno) d930 1 d951 2 a952 1 item = &(SJCache[entry->sjhe_groupno]); d959 1 a959 1 sjtouch(item, entry->sjhe_groupno); d1003 1 d1054 53 d1118 84 a1201 1 return (SM_FAIL); d1208 9 a1216 1 return (-1); d1223 3 a1225 1 return (SM_FAIL); d1234 38 a1271 1 return (SM_FAIL); d1280 56 a1335 1 return (SM_FAIL); d1344 1 a1344 1 return (SM_FAIL); a1365 2 #define RelationSetLockForExtend(r) d1381 1 d1383 1 a1383 4 ScanKeyEntry plmkey[2]; /* need to guarantee reln doesn't change size while we're thinking */ RelationSetLockForExtend(reln); d1430 1 a1430 1 item = sjfetchgrp(reldbid, reln->rd_id, maxblkno); @ 1.2 log @checkpoint -- sony jukebox manager starting to work @ text @d16 1 d22 1 d27 6 a32 1 RcsId("$Header: /users/mao/postgres/src/storage/smgr/RCS/sj.c,v 1.1 1991/07/09 00:12:09 mao Exp mao $"); d34 2 d160 1 d182 15 d208 1 d234 13 a246 5 extern void sjcacheinit(); extern void sjwait_init(); extern 
void sjunwait_init(); extern void sjwait_io(); extern void sjunwait_io(); d422 3 d509 1 a509 1 /* not waiting on I/O or anything */ d511 1 d705 1 a705 2 grpno = sjallocgrp(); item = &SJCache[grpno]; d763 1 a763 1 group->sjgd_extentsz = -1; d771 3 d777 3 d792 81 d885 3 a887 2 int sjallocgrp() a888 1 int grpno; d893 1 a893 1 grpno = SJHeader->sjh_nentries; d900 1 a900 1 item = &SJCache[grpno]; d905 48 a952 2 SJHeader->sjh_lruhead = grpno; SJHeader->sjh_lrutail = grpno; d954 3 a956 2 SJCache[SJHeader->sjh_lruhead].sjc_lruprev = grpno; SJHeader->sjh_lruhead = grpno; d959 35 a993 1 return (grpno); d1111 9 d1124 100 d1227 33 d1263 10 d1279 10 @ 1.1 log @Initial revision @ text @d4 2 a5 1 * This code manages relations that reside on magnetic disk. d8 2 d13 2 d16 2 d19 3 d23 917 d941 2 a942 1 RcsId("$Header$"); d944 66 d1011 1 a1011 1 * Only stub routines right now. d1015 2 a1016 1 sjinit() d1120 2 @