/*
 *  md.c -- magnetic disk storage manager.
 *
 *	This code manages relations that reside on magnetic disk.
 */

#include <sys/file.h>

#include "tmp/c.h"
#include "tmp/postgres.h"

#include "machine.h"
#include "storage/smgr.h"
#include "storage/block.h"
#include "storage/fd.h"
#include "utils/rel.h"

RcsId("$Header: /usr/local/dev/postgres/mastertree/src/storage/smgr/RCS/md.c,v 1.13 1992/08/13 22:53:01 mao Exp $");

/*
 *  Need to keep track of open file descriptors under the magnetic disk
 *  storage manager.
 */

static int	Nfds = 100;	/* must be same as in storage/file/fd.c */
static char	*Md_fdvec;

#define MDFD_CLEAN	(char) 0
#define MDFD_DIRTY	(char) 1

/*
 *  mdinit() -- Initialize private state for magnetic disk storage manager.
 *
 *	We keep a private table of all file descriptors.  Whenever we do
 *	a write to one, we mark it dirty in our table.  Whenever we force
 *	changes to disk, we mark the file descriptor clean.  At transaction
 *	commit, we force changes to disk for all dirty file descriptors.
 *	This routine allocates and initializes the table.
 *
 *	Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 */

int
mdinit()
{
    if ((Md_fdvec = (char *) malloc(Nfds)) == (char *) NULL)
	return (SM_FAIL);

    (void) bzero(Md_fdvec, Nfds);

    return (SM_SUCCESS);
}

int
mdcreate(reln)
    Relation reln;
{
    int fd;
    int tmp;
    char *path;
    extern char *relpath();
    extern bool IsBootstrapProcessingMode();

    path = relpath(&(reln->rd_rel->relname.data[0]));
    fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);

    /*
     *  If the file already exists and is empty, we pretend that the
     *  create succeeded.  During bootstrap processing, we skip that check,
     *  because pg_time, pg_variable, and pg_log get created before their
     *  .bki file entries are processed.
     */

    if (fd < 0) {
	if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) >= 0) {
	    if (!IsBootstrapProcessingMode() &&
		FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) {
		FileClose(fd);
		return (-1);
	    }
	}
    }

    if (fd >= Nfds)
	if (_fdvec_ext(fd) == SM_FAIL)
	    return (-1);

    return (fd);
}

/*
 *  mdunlink() -- Unlink a relation.
 */

int
mdunlink(reln)
    Relation reln;
{
    FileUnlink(RelationGetFile(reln));

    return (SM_SUCCESS);
}

/*
 *  mdextend() -- Add a block to the specified relation.
 *
 *	This routine returns SM_FAIL or SM_SUCCESS, with errno set as
 *	appropriate.
 */

int
mdextend(reln, buffer)
    Relation reln;
    char *buffer;
{
    long pos;
    File vfd;

    vfd = RelationGetFile(reln);

    if ((pos = FileSeek(vfd, 0L, L_XTND)) < 0)
	return (SM_FAIL);

    if (FileWrite(vfd, buffer, BLCKSZ) != BLCKSZ)
	return (SM_FAIL);

    /* remember that we did a write, so we can sync at xact commit */
    Md_fdvec[vfd] = MDFD_DIRTY;

    return (SM_SUCCESS);
}

/*
 *  mdopen() -- Open the specified relation.
 *
 *	The magnetic disk storage manager uses one file descriptor per open
 *	relation.  This routine returns the open file descriptor.
 */

int
mdopen(reln)
    Relation reln;
{
    char *path;
    int fd;
    extern char *relpath();

    path = relpath(&(reln->rd_rel->relname.data[0]));

    fd = FileNameOpenFile(path, O_RDWR, 0600);

    /* this should only happen during bootstrap processing */
    if (fd < 0)
	fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);

    if (fd >= Nfds)
	if (_fdvec_ext(fd) == SM_FAIL)
	    return (-1);

    return (fd);
}

/*
 *  mdclose() -- Close the specified relation.
 *
 *	Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 */

int
mdclose(reln)
    Relation reln;
{
    File vfd;
    int status;

    /* maybe it's already closed... */
    if ((vfd = RelationGetFile(reln)) < 0)
	return (SM_SUCCESS);

    /*
     *  Need to do it here.  We sync the file descriptor so that we don't
     *  need to reopen it at transaction commit to force changes to disk.
     */

    FileSync(vfd);
    FileClose(vfd);

    /* mark this file descriptor as clean in our private table */
    Md_fdvec[vfd] = MDFD_CLEAN;

    return (SM_SUCCESS);
}

/*
 *  mdread() -- Read the specified block from a relation.
 *
 *	Returns SM_SUCCESS or SM_FAIL.
 */

int
mdread(reln, blocknum, buffer)
    Relation reln;
    BlockNumber blocknum;
    char *buffer;
{
    int status;
    File vfd;
    long seekpos;
    int nbytes;

    if ((vfd = RelationGetFile(reln)) < 0) {
	if ((vfd = mdopen(reln)) < 0)
	    return (SM_FAIL);
	reln->rd_fd = vfd;
    }

    seekpos = (long) (BLCKSZ * blocknum);
    if (FileSeek(vfd, seekpos, L_SET) != seekpos) {
	return (SM_FAIL);
    }

    status = SM_SUCCESS;
    if ((nbytes = FileRead(vfd, buffer, BLCKSZ)) != BLCKSZ) {
	if (nbytes == 0) {
	    (void) bzero(buffer, BLCKSZ);
	} else {
	    status = SM_FAIL;
	}
    }

    return (status);
}

/*
 *  mdwrite() -- Write the supplied block at the appropriate location.
 *
 *	Returns SM_SUCCESS or SM_FAIL.
 */

int
mdwrite(reln, blocknum, buffer)
    Relation reln;
    BlockNumber blocknum;
    char *buffer;
{
    int status;
    bool found;
    File vfd;
    long seekpos;

    found = true;
    if ((vfd = RelationGetFile(reln)) < 0) {
	found = false;
	if ((vfd = mdopen(reln)) < 0)
	    return (SM_FAIL);
    }

    seekpos = (long) (BLCKSZ * blocknum);
    if (FileSeek(vfd, seekpos, L_SET) != seekpos) {
	if (!found)
	    (void) FileClose(vfd);

	return (SM_FAIL);
    }

    status = SM_SUCCESS;
    if (FileWrite(vfd, buffer, BLCKSZ) != BLCKSZ)
	status = SM_FAIL;

    /*
     *  If we opened this file descriptor especially to write this one block,
     *  force the change to disk and mark the descriptor as clean.  If we
     *  had an open descriptor for the file already, mark it as dirty so
     *  we'll flush it at commit time.
     */

    if (!found) {
	FileSync(vfd);
	FileClose(vfd);
	Md_fdvec[vfd] = MDFD_CLEAN;
    } else {
	Md_fdvec[vfd] = MDFD_DIRTY;
    }

    return (status);
}

/*
 *  mdflush() -- Synchronously write a block to disk.
 *
 *	This is exactly like mdwrite(), but doesn't return until the file
 *	system buffer cache has been flushed.
 */

int
mdflush(reln, blocknum, buffer)
    Relation reln;
    BlockNumber blocknum;
    char *buffer;
{
    int status;
    bool found;
    File vfd;
    long seekpos;

    found = true;
    if ((vfd = RelationGetFile(reln)) < 0) {
	found = false;
	if ((vfd = mdopen(reln)) < 0)
	    return (SM_FAIL);
    }

    seekpos = (long) (BLCKSZ * blocknum);
    if (FileSeek(vfd, seekpos, L_SET) != seekpos) {
	if (!found)
	    (void) FileClose(vfd);

	return (SM_FAIL);
    }

    status = SM_SUCCESS;

    /* write and sync the block */
    if (FileWrite(vfd, buffer, BLCKSZ) != BLCKSZ || FileSync(vfd) < 0)
	status = SM_FAIL;

    /*
     *  By here, the block is written and changes have been forced to stable
     *  storage.  Mark the descriptor as clean until the next write, so we
     *  don't sync it again unnecessarily at transaction commit.
     */

    Md_fdvec[vfd] = MDFD_CLEAN;

    if (!found)
	FileClose(vfd);

    return (status);
}

/*
 *  mdblindwrt() -- Write a block to disk blind.
 *
 *	We have to be able to do this using only the name and OID of
 *	the database and relation in which the block belongs.  This
 *	is a synchronous write.
 */

int
mdblindwrt(dbstr, relstr, dbid, relid, blkno, buffer)
    char *dbstr;
    char *relstr;
    OID dbid;
    OID relid;
    BlockNumber blkno;
    char *buffer;
{
    int fd;
    long seekpos;
    int status;
    char path[64];

    /* construct the path to the file and open it */
    sprintf(path, "../%s/%s", (dbid == (OID) 0 ? ".." : dbstr), relstr);
    if ((fd = open(path, O_RDWR, 0600)) < 0)
	return (SM_FAIL);

    /* seek to the right spot */
    seekpos = (long) (BLCKSZ * blkno);
    if (lseek(fd, seekpos, L_SET) != seekpos) {
	(void) close(fd);
	return (SM_FAIL);
    }

    status = SM_SUCCESS;

    /* write and sync the block */
    if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0)
	status = SM_FAIL;

    if (close(fd) < 0)
	status = SM_FAIL;

    return (status);
}

/*
 *  mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *	Returns # of blocks or -1 on error.
 */

int
mdnblocks(reln)
    Relation reln;
{
    File vfd;

    vfd = RelationGetFile(reln);

    return (FileGetNumberOfBlocks(vfd));
}

/*
 *  mdcommit() -- Commit a transaction.
 *
 *	All changes to magnetic disk relations must be forced to stable
 *	storage.  This routine makes a pass over the private table of
 *	file descriptors.  Any descriptors to which we have done writes,
 *	but not synced, are synced here.
 *
 *	Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 */

int
mdcommit()
{
    int i;

    for (i = 0; i < Nfds; i++) {
	if (Md_fdvec[i] == MDFD_DIRTY) {
	    if (FileSync(i) < 0)
		return (SM_FAIL);

	    Md_fdvec[i] = MDFD_CLEAN;
	}
    }

    return (SM_SUCCESS);
}

/*
 *  mdabort() -- Abort a transaction.
 *
 *	Changes need not be forced to disk at transaction abort.  We mark
 *	all file descriptors as clean here.  Always returns SM_SUCCESS.
 */

int
mdabort()
{
    bzero(Md_fdvec, Nfds * sizeof(int));
    return (SM_SUCCESS);
}

/*
 *  _fdvec_ext() -- Extend the md file descriptor vector.
 *
 *	The file descriptor vector must be large enough to hold at least
 *	'fd' entries.
 */

int
_fdvec_ext(fd)
    int fd;
{
    char *nvec;
    int orig;

    orig = Nfds;

    do
	Nfds *= 2;
    while (Nfds <= fd);

    if ((nvec = (char *) malloc(Nfds)) == (char *) NULL)
	return (SM_FAIL);

    (void) bzero(nvec, Nfds);
    (void) bcopy(Md_fdvec, nvec, orig);

    free(Md_fdvec);

    Md_fdvec = nvec;

    return (SM_SUCCESS);
}
