/*
 * Copyright (c) 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)sys_inode.c	1.11 (2.11BSD) 1999/9/10
 */

#include "param.h"
#include "../machine/seg.h"
#include "user.h"
#include "proc.h"
#include "signalvar.h"
#include "inode.h"
#include "buf.h"
#include "fs.h"
#include "file.h"
#include "stat.h"
#include "mount.h"
#include "conf.h"
#include "uio.h"
#include "ioctl.h"
#include "tty.h"
#include "kernel.h"
#include "systm.h"
#include "syslog.h"
#ifdef QUOTA
#include "quota.h"
#endif

extern	int vn_closefile();
int	ino_rw(), ino_ioctl(), ino_select();

struct	fileops inodeops =
	{ ino_rw, ino_ioctl, ino_select, vn_closefile };

/*
 * File-table read/write entry point for inodes.  Locks the inode (except
 * for character specials, which have no file data to protect), translates
 * the file flags (FAPPEND/FNONBLOCK/FFSYNC and the MNT_SYNCHRONOUS mount
 * option) into IO_* flags for rwip(), and advances f_offset by the number
 * of bytes actually transferred.
 */
ino_rw(fp, uio)
	struct file *fp;
	register struct uio *uio;
{
	register struct inode *ip = (struct inode *)fp->f_data;
	u_int count, error;
	int ioflag;

	if ((ip->i_mode&IFMT) != IFCHR)
		ILOCK(ip);
	uio->uio_offset = fp->f_offset;
	count = uio->uio_resid;
	if (uio->uio_rw == UIO_READ) {
		error = rwip(ip, uio, fp->f_flag & FNONBLOCK ? IO_NDELAY : 0);
		fp->f_offset += (count - uio->uio_resid);
	} else {
		ioflag = 0;
		if ((ip->i_mode&IFMT) == IFREG && (fp->f_flag & FAPPEND))
			ioflag |= IO_APPEND;
		if (fp->f_flag & FNONBLOCK)
			ioflag |= IO_NDELAY;
		if (fp->f_flag & FFSYNC ||
		    (ip->i_fs->fs_flags & MNT_SYNCHRONOUS))
			ioflag |= IO_SYNC;
		error = rwip(ip, uio, ioflag);
		/*
		 * For appends rwip() repositioned uio_offset to EOF before
		 * writing, so take the resulting offset verbatim.
		 */
		if (ioflag & IO_APPEND)
			fp->f_offset = uio->uio_offset;
		else
			fp->f_offset += (count - uio->uio_resid);
	}
	if ((ip->i_mode&IFMT) != IFCHR)
		IUNLOCK(ip);
	return (error);
}

/*
 * Kernel-internal read/write of an inode.  Builds a single-segment uio
 * around (base, len, offset) and hands it to rwip().  If 'aresid' is
 * non-NULL the untransferred byte count is returned through it; otherwise
 * a short transfer is converted into EIO.
 */
rdwri(rw, ip, base, len, offset, segflg, ioflg, aresid)
	enum uio_rw rw;
	struct inode *ip;
	caddr_t base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	register int *aresid;
{
	struct uio auio;
	struct iovec aiov;
	register int error;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_rw = rw;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	error = rwip(ip, &auio, ioflg);
	if (aresid)
		*aresid = auio.uio_resid;
	else if (auio.uio_resid)
		error = EIO;
	return (error);
}

/*
 * Common inode I/O engine: moves uio data to/from the buffer cache for
 * regular files, directories, symlinks and block specials, and dispatches
 * directly to the driver for character specials.  Honors IO_APPEND,
 * IO_SYNC, IO_NDELAY and (on error during a write) IO_UNIT backout.
 */
rwip(ip, uio, ioflag)
	register struct inode *ip;
	register struct uio *uio;
	int ioflag;
{
	dev_t dev = (dev_t)ip->i_rdev;
	register struct buf *bp;
	off_t osize;
	daddr_t lbn, bn;
	int n, on, type, resid;
	int error = 0;
	int flags;

	if (uio->uio_offset < 0)
		return (EINVAL);
	type = ip->i_mode&IFMT;
	/*
	 * The write case below checks that i/o is done synchronously to
	 * directories and that i/o to append only files takes place at the
	 * end of file.  We do not panic on non-sync directory i/o - the
	 * sync bit is forced on.
	 */
	if (uio->uio_rw == UIO_READ) {
		if (!(ip->i_fs->fs_flags & MNT_NOATIME))
			ip->i_flag |= IACC;
	} else {
		switch (type) {
		case IFREG:
			if (ioflag & IO_APPEND)
				uio->uio_offset = ip->i_size;
			if (ip->i_flags & APPEND &&
			    uio->uio_offset != ip->i_size)
				return (EPERM);
			break;
		case IFDIR:
			if ((ioflag & IO_SYNC) == 0)
				ioflag |= IO_SYNC;
			break;
		case IFLNK:
		case IFBLK:
		case IFCHR:
			break;
		default:
			return (EFTYPE);
		}
	}
	/*
	 * The IO_SYNC flag is turned off here if the 'async' mount flag is
	 * on.  Otherwise directory I/O (which is done by the kernel) would
	 * still be synchronous (because the kernel carefully passes IO_SYNC
	 * for all directory I/O) even if the fs was mounted with "-o async".
	 *
	 * A side effect of this is that if the system administrator mounts
	 * a filesystem 'async' then the O_FSYNC flag to open() is ignored.
	 *
	 * This behaviour should probably be selectable via
	 * "sysctl fs.async.dirs" and "fs.async.ofsync".  A project for a
	 * rainy day.
	 *
	 * NOTE: the (type == IFREG || type == IFDIR) test must be
	 * parenthesized - without the parens '&&' binds tighter than '||'
	 * and IO_SYNC would be cleared for *every* regular file regardless
	 * of the async mount flag.
	 */
	if ((type == IFREG || type == IFDIR) &&
	    (ip->i_fs->fs_flags & MNT_ASYNC))
		ioflag &= ~IO_SYNC;
	if (type == IFCHR) {
		if (uio->uio_rw == UIO_READ) {
			if (!(ip->i_fs->fs_flags & MNT_NOATIME))
				ip->i_flag |= IACC;
			error = (*cdevsw[major(dev)].d_read)(dev, uio, ioflag);
		} else {
			ip->i_flag |= IUPD|ICHG;
			error = (*cdevsw[major(dev)].d_write)(dev, uio, ioflag);
		}
		return (error);
	}
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_rw == UIO_WRITE && type == IFREG &&
	    uio->uio_offset + uio->uio_resid >
	    u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(u.u_procp, SIGXFSZ);
		return (EFBIG);
	}
#ifdef QUOTA
	/*
	 * we do bytes, see the comment on 'blocks' in ino_stat().
	 *
	 * the simplfying assumption is made that the entire write will
	 * succeed, otherwise we have to check the quota on each block.
	 * can you say slow?  i knew you could.  SMS
	 */
	if ((type == IFREG || type == IFDIR || type == IFLNK) &&
	    uio->uio_rw == UIO_WRITE && !(ip->i_flag & IPIPE)) {
		if (uio->uio_offset + uio->uio_resid > ip->i_size) {
			QUOTAMAP();
			error = chkdq(ip,
			    uio->uio_offset+uio->uio_resid - ip->i_size, 0);
			QUOTAUNMAP();
			if (error)
				return (error);
		}
	}
#endif
	if (type != IFBLK)
		dev = ip->i_dev;
	resid = uio->uio_resid;
	osize = ip->i_size;
	flags = ioflag & IO_SYNC ? B_SYNC : 0;
	do {
		lbn = lblkno(uio->uio_offset);
		on = blkoff(uio->uio_offset);
		n = MIN((u_int)(DEV_BSIZE - on), uio->uio_resid);
		if (type != IFBLK) {
			if (uio->uio_rw == UIO_READ) {
				off_t diff = ip->i_size - uio->uio_offset;

				if (diff <= 0)
					return (0);
				if (diff < n)
					n = diff;
				bn = bmap(ip, lbn, B_READ, flags);
			} else
				bn = bmap(ip, lbn, B_WRITE,
				    n == DEV_BSIZE ? flags : flags|B_CLRBUF);
			if (u.u_error ||
			    uio->uio_rw == UIO_WRITE && (long)bn<0)
				return (u.u_error);
			if (uio->uio_rw == UIO_WRITE &&
			    uio->uio_offset + n > ip->i_size &&
			    (type == IFDIR || type == IFREG || type == IFLNK))
				ip->i_size = uio->uio_offset + n;
		} else {
			bn = lbn;
			rablock = bn + 1;
		}
		if (uio->uio_rw == UIO_READ) {
			if ((long)bn<0) {
				/* hole in the file: read as zeroes */
				bp = geteblk();
				clrbuf(bp);
			} else if (ip->i_lastr + 1 == lbn)
				bp = breada(dev, bn, rablock);
			else
				bp = bread(dev, bn);
			ip->i_lastr = lbn;
		} else {
			if (n == DEV_BSIZE)
				bp = getblk(dev, bn);
			else
				bp = bread(dev, bn);
			/*
			 * 4.3 didn't do this, but 2.10 did.  not sure why.
			 * something about tape drivers don't clear buffers on
			 * end-of-tape any longer (clrbuf can't be called from
			 * interrupt).
			 */
			if (bp->b_resid == DEV_BSIZE) {
				bp->b_resid = 0;
				clrbuf(bp);
			}
		}
		n = MIN(n, DEV_BSIZE - bp->b_resid);
		if (bp->b_flags & B_ERROR) {
			error = EIO;
			brelse(bp);
			break;
		}
		u.u_error = uiomove(mapin(bp)+on, n, uio);
		mapout(bp);
		if (uio->uio_rw == UIO_READ) {
			if (n + on == DEV_BSIZE ||
			    uio->uio_offset == ip->i_size) {
				bp->b_flags |= B_AGE;
				if (ip->i_flag & IPIPE)
					bp->b_flags &= ~B_DELWRI;
			}
			brelse(bp);
		} else {
			if (ioflag & IO_SYNC)
				bwrite(bp);
			/*
			 * The check below interacts _very_ badly with virtual
			 * memory tmp files such as those used by 'ld'.  These
			 * files tend to be small and repeatedly rewritten in
			 * 1kb chunks.  The check below causes the device
			 * driver to be called (and I/O initiated) constantly.
			 * Not sure what to do about this yet but this comment
			 * is being placed here as a reminder.
			 */
			else if (n + on == DEV_BSIZE &&
			    !(ip->i_flag & IPIPE)) {
				bp->b_flags |= B_AGE;
				bawrite(bp);
			} else
				bdwrite(bp);
			ip->i_flag |= IUPD|ICHG;
			if (u.u_ruid != 0)
				ip->i_mode &= ~(ISUID|ISGID);
		}
	} while (u.u_error == 0 && uio->uio_resid && n != 0);
	if (error == 0)				/* XXX */
		error = u.u_error;		/* XXX */
	if (error && (uio->uio_rw == UIO_WRITE) && (ioflag & IO_UNIT) &&
	    (type != IFBLK)) {
		itrunc(ip, osize, ioflag & IO_SYNC);
		uio->uio_offset -= (resid - uio->uio_resid);
		uio->uio_resid = resid;
		/*
		 * Should back out the change to the quota here but that would
		 * be a lot of work for little benefit.  Besides we've already
		 * made the assumption that the entire write would succeed and
		 * users can't turn on the IO_UNIT bit for their writes
		 * anyways.
		 */
	}
#ifdef whybother
	if (!error && (ioflag & IO_SYNC))
		IUPDAT(ip, &time, &time, 1);
#endif
	return (error);
}

/*
 * File-table ioctl entry point for inodes.  FIONREAD is answered from the
 * inode for regular files/directories; FIONBIO/FIOASYNC are accepted and
 * ignored; character specials are passed through to the driver's d_ioctl.
 */
ino_ioctl(fp, com, data)
	register struct file *fp;
	register u_int com;
	caddr_t data;
{
	register struct inode *ip = ((struct inode *)fp->f_data);
	dev_t dev;

	switch (ip->i_mode & IFMT) {

	case IFREG:
	case IFDIR:
		if (com == FIONREAD) {
			if (fp->f_type==DTYPE_PIPE && !(fp->f_flag&FREAD))
				*(off_t *)data = 0;
			else
				*(off_t *)data = ip->i_size - fp->f_offset;
			return (0);
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* fall into ... */

	default:
		return (ENOTTY);

	case IFCHR:
		dev = ip->i_rdev;
		u.u_r.r_val1 = 0;
		if (setjmp(&u.u_qsave))
			/*
			 * The ONLY way we can get here is via the longjump
			 * in sleep.  Signals have been checked for and
			 * u_error set accordingly.  All that remains to do
			 * is 'return'.
			 */
			return (u.u_error);
		return ((*cdevsw[major(dev)].d_ioctl)(dev,com,data,
		    fp->f_flag));
	}
}

/*
 * File-table select entry point for inodes.  Regular files and directories
 * are always ready; character specials defer to the driver's d_select.
 */
ino_select(fp, which)
	struct file *fp;
	int which;
{
	register struct inode *ip = (struct inode *)fp->f_data;
	register dev_t dev;

	switch (ip->i_mode & IFMT) {

	default:
		return (1);		/* XXX */

	case IFCHR:
		dev = ip->i_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, which);
	}
}

/*
 * Fill in a stat structure from an in-core inode.  Pending IACC/IUPD/ICHG
 * timestamps are folded into the inode first (an inlined ITIMES).
 */
ino_stat(ip, sb)
	register struct inode *ip;
	register struct stat *sb;
{
	register struct icommon2 *ic2;

#ifdef EXTERNALITIMES
	mapseg5(xitimes, xitdesc);
	ic2 = &((struct icommon2 *)SEG5)[ip - inode];
#else
	ic2 = &ip->i_ic2;
#endif
	/*
	 * inlined ITIMES which takes advantage of the common times pointer.
	 */
	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		ip->i_flag |= IMOD;
		if (ip->i_flag & IACC)
			ic2->ic_atime = time.tv_sec;
		if (ip->i_flag & IUPD)
			ic2->ic_mtime = time.tv_sec;
		if (ip->i_flag & ICHG)
			ic2->ic_ctime = time.tv_sec;
		ip->i_flag &= ~(IUPD|IACC|ICHG);
	}
	sb->st_dev = ip->i_dev;
	sb->st_ino = ip->i_number;
	sb->st_mode = ip->i_mode;
	sb->st_nlink = ip->i_nlink;
	sb->st_uid = ip->i_uid;
	sb->st_gid = ip->i_gid;
	sb->st_rdev = (dev_t)ip->i_rdev;
	sb->st_size = ip->i_size;
	sb->st_atime = ic2->ic_atime;
	sb->st_spare1 = 0;
	sb->st_mtime = ic2->ic_mtime;
	sb->st_spare2 = 0;
	sb->st_ctime = ic2->ic_ctime;
	sb->st_spare3 = 0;
	sb->st_blksize = MAXBSIZE;
	/*
	 * blocks are too tough to do; it's not worth the effort.
	 */
	sb->st_blocks = btodb(ip->i_size + MAXBSIZE - 1);
	sb->st_flags = ip->i_flags;
	sb->st_spare4[0] = 0;
	sb->st_spare4[1] = 0;
	sb->st_spare4[2] = 0;
#ifdef EXTERNALITIMES
	normalseg5();
#endif
	return (0);
}

/*
 * This routine, like its counterpart openi(), calls the device driver for
 * special (IBLK, ICHR) files.  Normal files simply return early (the
 * default case in the switch statement).  Pipes and sockets do NOT come
 * here because they have their own close routines.
 */
closei(ip, flag)
	register struct inode *ip;
	int flag;
{
	register struct mount *mp;
	register struct file *fp;
	int mode, error;
	dev_t dev;
	int (*cfunc)();

	mode = ip->i_mode & IFMT;
	dev = ip->i_rdev;
	switch (mode) {

	case IFCHR:
		cfunc = cdevsw[major(dev)].d_close;
		break;

	case IFBLK:
		/*
		 * We don't want to really close the device if it is mounted
		 */
		/* MOUNT TABLE SHOULD HOLD INODE */
		for (mp = mount; mp < &mount[NMOUNT]; mp++)
			if (mp->m_inodp != NULL && mp->m_dev == dev)
				return (0);	/* was a bare 'return' -
						 * value was indeterminate */
		cfunc = bdevsw[major(dev)].d_close;
		break;

	default:
		return (0);
	}
	/*
	 * Check that another inode for the same device isn't active.
	 * This is because the same device can be referenced by two
	 * different inodes.
	 */
	for (fp = file; fp < fileNFILE; fp++) {
		if (fp->f_type != DTYPE_INODE)
			continue;
		if (fp->f_count && (ip = (struct inode *)fp->f_data) &&
		    ip->i_rdev == dev && (ip->i_mode&IFMT) == mode)
			return (0);
	}
	if (mode == IFBLK) {
		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		bflush(dev);
		binval(dev);
	}
	/*
	 * NOTE: none of the device drivers appear to either set u_error OR
	 * return anything meaningful from their close routines.  It's a good
	 * thing programs don't bother checking the error status on close()
	 * calls.  Apparently the only time "errno" is meaningful after a
	 * "close" is when the process is interrupted.
	 */
	if (setjmp(&u.u_qsave)) {
		/*
		 * If device close routine is interrupted,
		 * must return so closef can clean up.
		 */
		if ((error = u.u_error) == 0)
			error = EINTR;
	} else
		error = (*cfunc)(dev, flag, mode);
	return (error);
}

/*
 * Place an advisory lock on an inode.
 * NOTE: callers of this routine must be prepared to deal with the pseudo
 * error return ERESTART.
 */
ino_lock(fp, cmd)
	register struct file *fp;
	int cmd;
{
	register int priority = PLOCK;
	register struct inode *ip = (struct inode *)fp->f_data;
	int error;

	if ((cmd & LOCK_EX) == 0)
		priority += 4;
	/*
	 * If there's a exclusive lock currently applied to the file then
	 * we've gotta wait for the lock with everyone else.
	 *
	 * NOTE: We can NOT sleep on i_exlockc because it is on an odd byte
	 * boundary and the low (oddness) bit is reserved for networking/
	 * supervisor mode sleep channels.  Thus we always sleep on
	 * i_shlockc and simply check the proper bits to see if the lock we
	 * want is granted.  This may mean an extra wakeup/sleep event is
	 * done once in a while but everything will work correctly.
	 */
again:
	while (ip->i_flag & IEXLOCK) {
		/*
		 * If we're holding an exclusive
		 * lock, then release it.
		 */
		if (fp->f_flag & FEXLOCK) {
			ino_unlock(fp, FEXLOCK);
			continue;
		}
		if (cmd & LOCK_NB)
			return (EWOULDBLOCK);
		ip->i_flag |= ILWAIT;
		error = tsleep((caddr_t)&ip->i_shlockc,
		    priority | PCATCH, 0);
		if (error)
			return (error);
	}
	if ((cmd & LOCK_EX) && (ip->i_flag & ISHLOCK)) {
		/*
		 * Must wait for any shared locks to finish
		 * before we try to apply a exclusive lock.
		 *
		 * If we're holding a shared
		 * lock, then release it.
		 */
		if (fp->f_flag & FSHLOCK) {
			ino_unlock(fp, FSHLOCK);
			goto again;
		}
		if (cmd & LOCK_NB)
			return (EWOULDBLOCK);
		ip->i_flag |= ILWAIT;
		error = tsleep((caddr_t)&ip->i_shlockc, PLOCK | PCATCH, 0);
		if (error)
			return (error);
		goto again;
	}
	if (cmd & LOCK_EX) {
		cmd &= ~LOCK_SH;
		ip->i_exlockc++;
		ip->i_flag |= IEXLOCK;
		fp->f_flag |= FEXLOCK;
	}
	if ((cmd & LOCK_SH) && (fp->f_flag & FSHLOCK) == 0) {
		ip->i_shlockc++;
		ip->i_flag |= ISHLOCK;
		fp->f_flag |= FSHLOCK;
	}
	return (0);
}

/*
 * Unlock a file.
 */
ino_unlock(fp, kind)
	register struct file *fp;
	int kind;
{
	register struct inode *ip = (struct inode *)fp->f_data;
	register int flags;

	kind &= fp->f_flag;
	if (ip == NULL || kind == 0)
		return;
	flags = ip->i_flag;
	if (kind & FSHLOCK) {
		if (--ip->i_shlockc == 0) {
			ip->i_flag &= ~ISHLOCK;
			if (flags & ILWAIT)
				wakeup((caddr_t)&ip->i_shlockc);
		}
		fp->f_flag &= ~FSHLOCK;
	}
	if (kind & FEXLOCK) {
		if (--ip->i_exlockc == 0) {
			ip->i_flag &= ~(IEXLOCK|ILWAIT);
			if (flags & ILWAIT)
				wakeup((caddr_t)&ip->i_shlockc);
		}
		fp->f_flag &= ~FEXLOCK;
	}
}

/*
 * Openi called to allow handler of special files to initialize and
 * validate before actual IO.
 */
openi(ip, mode)
	register struct inode *ip;
	int mode;
{
	register dev_t dev = ip->i_rdev;
	register int maj = major(dev);
	dev_t bdev;
	int error;

	switch (ip->i_mode&IFMT) {

	case IFCHR:
		if (ip->i_fs->fs_flags & MNT_NODEV)
			return (ENXIO);
		if ((u_int)maj >= nchrdev)
			return (ENXIO);
		if (mode & FWRITE) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, IFCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV &&
				    (error = ufs_mountedon(bdev)))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}
		return ((*cdevsw[maj].d_open)(dev, mode, S_IFCHR));

	case IFBLK:
		if (ip->i_fs->fs_flags & MNT_NODEV)
			return (ENXIO);
		if ((u_int)maj >= nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && (mode & FWRITE) &&
		    isdisk(dev, IFBLK))
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 *
		 * 2.11BSD must relax this restriction to allow 'fsck' to
		 * open the root filesystem (which is always mounted) during
		 * a reboot.  Once in secure or very secure mode the
		 * above restriction is fully effective.  On the otherhand
		 * fsck should 1) use the raw device, 2) not do sync calls...
		 */
		if (securelevel > 0 && (error = ufs_mountedon(dev)))
			return (error);
		return ((*bdevsw[maj].d_open)(dev, mode, S_IFBLK));
	}
	return (0);
}

/*
 * Revoke access the current tty by all processes.
 * Used only by the super-user in init
 * to give ``clean'' terminals at login.
 */
vhangup()
{

	if (!suser())
		return;
	if (u.u_ttyp == NULL)
		return;
	forceclose(u.u_ttyd);
	if ((u.u_ttyp->t_state) & TS_ISOPEN)
		gsignal(u.u_ttyp->t_pgrp, SIGHUP);
}

/*
 * Strip read/write access from every open file-table entry that refers
 * to the given character device (the controlling tty being revoked).
 */
forceclose(dev)
	register dev_t dev;
{
	register struct file *fp;
	register struct inode *ip;

	for (fp = file; fp < fileNFILE; fp++) {
		if (fp->f_count == 0)
			continue;
		if (fp->f_type != DTYPE_INODE)
			continue;
		ip = (struct inode *)fp->f_data;
		if (ip == 0)
			continue;
		if ((ip->i_mode & IFMT) != IFCHR)
			continue;
		if (ip->i_rdev != dev)
			continue;
		fp->f_flag &= ~(FREAD|FWRITE);
	}
}