/*
 * Copyright (c) 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)sys_inode.c	1.11 (2.11BSD) 1999/9/10
 */

#include "param.h"
#include "../machine/seg.h"
#include "user.h"
#include "proc.h"
#include "signalvar.h"
#include "inode.h"
#include "buf.h"
#include "fs.h"
#include "file.h"
#include "stat.h"
#include "mount.h"
#include "conf.h"
#include "uio.h"
#include "ioctl.h"
#include "tty.h"
#include "kernel.h"
#include "systm.h"
#include "syslog.h"
#ifdef QUOTA
#include "quota.h"
#endif

extern	int vn_closefile();
int	ino_rw(), ino_ioctl(), ino_select();

struct	fileops inodeops =
	{ ino_rw, ino_ioctl, ino_select, vn_closefile };

/*
 * File-table read/write entry point for inodes.  Locks the inode (except
 * for character specials, which have no file data to protect), translates
 * the file flags (FAPPEND/FNONBLOCK/FFSYNC and the MNT_SYNCHRONOUS mount
 * option) into IO_* flags for rwip(), and advances f_offset by the number
 * of bytes actually transferred.
 */
ino_rw(fp, uio)
	struct file *fp;
	register struct uio *uio;
{
	register struct inode *ip = (struct inode *)fp->f_data;
	u_int count, error;
	int ioflag;

	if ((ip->i_mode&IFMT) != IFCHR)
		ILOCK(ip);
	uio->uio_offset = fp->f_offset;
	count = uio->uio_resid;
	if (uio->uio_rw == UIO_READ) {
		error = rwip(ip, uio, fp->f_flag & FNONBLOCK ? IO_NDELAY : 0);
		fp->f_offset += (count - uio->uio_resid);
	} else {
		ioflag = 0;
		if ((ip->i_mode&IFMT) == IFREG && (fp->f_flag & FAPPEND))
			ioflag |= IO_APPEND;
		if (fp->f_flag & FNONBLOCK)
			ioflag |= IO_NDELAY;
		if (fp->f_flag & FFSYNC ||
		    (ip->i_fs->fs_flags & MNT_SYNCHRONOUS))
			ioflag |= IO_SYNC;
		error = rwip(ip, uio, ioflag);
		/*
		 * For appends rwip() repositioned uio_offset to EOF before
		 * writing, so take the resulting offset verbatim.
		 */
		if (ioflag & IO_APPEND)
			fp->f_offset = uio->uio_offset;
		else
			fp->f_offset += (count - uio->uio_resid);
	}
	if ((ip->i_mode&IFMT) != IFCHR)
		IUNLOCK(ip);
	return (error);
}

/*
 * Kernel-internal read/write of an inode.  Builds a single-segment uio
 * around (base, len, offset) and hands it to rwip().  If 'aresid' is
 * non-NULL the untransferred byte count is returned through it; otherwise
 * a short transfer is converted into EIO.
 */
rdwri(rw, ip, base, len, offset, segflg, ioflg, aresid)
	enum uio_rw rw;
	struct inode *ip;
	caddr_t base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	register int *aresid;
{
	struct uio auio;
	struct iovec aiov;
	register int error;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_rw = rw;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	error = rwip(ip, &auio, ioflg);
	if (aresid)
		*aresid = auio.uio_resid;
	else if (auio.uio_resid)
		error = EIO;
	return (error);
}

/*
 * Common inode I/O engine: moves uio data to/from the buffer cache for
 * regular files, directories, symlinks and block specials, and dispatches
 * directly to the driver for character specials.  Honors IO_APPEND,
 * IO_SYNC, IO_NDELAY and (on error during a write) IO_UNIT backout.
 */
rwip(ip, uio, ioflag)
	register struct inode *ip;
	register struct uio *uio;
	int ioflag;
{
	dev_t dev = (dev_t)ip->i_rdev;
	register struct buf *bp;
	off_t osize;
	daddr_t lbn, bn;
	int n, on, type, resid;
	int error = 0;
	int flags;

	if (uio->uio_offset < 0)
		return (EINVAL);
	type = ip->i_mode&IFMT;
	/*
	 * The write case below checks that i/o is done synchronously to
	 * directories and that i/o to append only files takes place at the
	 * end of file.  We do not panic on non-sync directory i/o - the
	 * sync bit is forced on.
	 */
	if (uio->uio_rw == UIO_READ) {
		if (!(ip->i_fs->fs_flags & MNT_NOATIME))
			ip->i_flag |= IACC;
	} else {
		switch (type) {
		case IFREG:
			if (ioflag & IO_APPEND)
				uio->uio_offset = ip->i_size;
			if (ip->i_flags & APPEND &&
			    uio->uio_offset != ip->i_size)
				return (EPERM);
			break;
		case IFDIR:
			if ((ioflag & IO_SYNC) == 0)
				ioflag |= IO_SYNC;
			break;
		case IFLNK:
		case IFBLK:
		case IFCHR:
			break;
		default:
			return (EFTYPE);
		}
	}
	/*
	 * The IO_SYNC flag is turned off here if the 'async' mount flag is
	 * on.  Otherwise directory I/O (which is done by the kernel) would
	 * still be synchronous (because the kernel carefully passes IO_SYNC
	 * for all directory I/O) even if the fs was mounted with "-o async".
	 *
	 * A side effect of this is that if the system administrator mounts
	 * a filesystem 'async' then the O_FSYNC flag to open() is ignored.
	 *
	 * This behaviour should probably be selectable via
	 * "sysctl fs.async.dirs" and "fs.async.ofsync".  A project for a
	 * rainy day.
	 *
	 * NOTE: the (type == IFREG || type == IFDIR) test must be
	 * parenthesized - without the parens '&&' binds tighter than '||'
	 * and IO_SYNC would be cleared for *every* regular file regardless
	 * of the async mount flag.
	 */
	if ((type == IFREG || type == IFDIR) &&
	    (ip->i_fs->fs_flags & MNT_ASYNC))
		ioflag &= ~IO_SYNC;
	if (type == IFCHR) {
		if (uio->uio_rw == UIO_READ) {
			if (!(ip->i_fs->fs_flags & MNT_NOATIME))
				ip->i_flag |= IACC;
			error = (*cdevsw[major(dev)].d_read)(dev, uio, ioflag);
		} else {
			ip->i_flag |= IUPD|ICHG;
			error = (*cdevsw[major(dev)].d_write)(dev, uio, ioflag);
		}
		return (error);
	}
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_rw == UIO_WRITE && type == IFREG &&
	    uio->uio_offset + uio->uio_resid >
	    u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(u.u_procp, SIGXFSZ);
		return (EFBIG);
	}
#ifdef QUOTA
	/*
	 * we do bytes, see the comment on 'blocks' in ino_stat().
	 *
	 * the simplfying assumption is made that the entire write will
	 * succeed, otherwise we have to check the quota on each block.
	 * can you say slow?  i knew you could.  SMS
	 */
	if ((type == IFREG || type == IFDIR || type == IFLNK) &&
	    uio->uio_rw == UIO_WRITE && !(ip->i_flag & IPIPE)) {
		if (uio->uio_offset + uio->uio_resid > ip->i_size) {
			QUOTAMAP();
			error = chkdq(ip,
			    uio->uio_offset+uio->uio_resid - ip->i_size, 0);
			QUOTAUNMAP();
			if (error)
				return (error);
		}
	}
#endif
	if (type != IFBLK)
		dev = ip->i_dev;
	resid = uio->uio_resid;
	osize = ip->i_size;
	flags = ioflag & IO_SYNC ? B_SYNC : 0;
	do {
		lbn = lblkno(uio->uio_offset);
		on = blkoff(uio->uio_offset);
		n = MIN((u_int)(DEV_BSIZE - on), uio->uio_resid);
		if (type != IFBLK) {
			if (uio->uio_rw == UIO_READ) {
				off_t diff = ip->i_size - uio->uio_offset;

				if (diff <= 0)
					return (0);
				if (diff < n)
					n = diff;
				bn = bmap(ip, lbn, B_READ, flags);
			} else
				bn = bmap(ip, lbn, B_WRITE,
				    n == DEV_BSIZE ? flags : flags|B_CLRBUF);
			if (u.u_error ||
			    uio->uio_rw == UIO_WRITE && (long)bn<0)
				return (u.u_error);
			if (uio->uio_rw == UIO_WRITE &&
			    uio->uio_offset + n > ip->i_size &&
			    (type == IFDIR || type == IFREG || type == IFLNK))
				ip->i_size = uio->uio_offset + n;
		} else {
			bn = lbn;
			rablock = bn + 1;
		}
		if (uio->uio_rw == UIO_READ) {
			if ((long)bn<0) {
				/* hole in the file: read as zeroes */
				bp = geteblk();
				clrbuf(bp);
			} else if (ip->i_lastr + 1 == lbn)
				bp = breada(dev, bn, rablock);
			else
				bp = bread(dev, bn);
			ip->i_lastr = lbn;
		} else {
			if (n == DEV_BSIZE)
				bp = getblk(dev, bn);
			else
				bp = bread(dev, bn);
			/*
			 * 4.3 didn't do this, but 2.10 did.  not sure why.
			 * something about tape drivers don't clear buffers on
			 * end-of-tape any longer (clrbuf can't be called from
			 * interrupt).
			 */
			if (bp->b_resid == DEV_BSIZE) {
				bp->b_resid = 0;
				clrbuf(bp);
			}
		}
		n = MIN(n, DEV_BSIZE - bp->b_resid);
		if (bp->b_flags & B_ERROR) {
			error = EIO;
			brelse(bp);
			break;
		}
		u.u_error = uiomove(mapin(bp)+on, n, uio);
		mapout(bp);
		if (uio->uio_rw == UIO_READ) {
			if (n + on == DEV_BSIZE ||
			    uio->uio_offset == ip->i_size) {
				bp->b_flags |= B_AGE;
				if (ip->i_flag & IPIPE)
					bp->b_flags &= ~B_DELWRI;
			}
			brelse(bp);
		} else {
			if (ioflag & IO_SYNC)
				bwrite(bp);
			/*
			 * The check below interacts _very_ badly with virtual
			 * memory tmp files such as those used by 'ld'.  These
			 * files tend to be small and repeatedly rewritten in
			 * 1kb chunks.  The check below causes the device
			 * driver to be called (and I/O initiated) constantly.
			 * Not sure what to do about this yet but this comment
			 * is being placed here as a reminder.
			 */
			else if (n + on == DEV_BSIZE &&
			    !(ip->i_flag & IPIPE)) {
				bp->b_flags |= B_AGE;
				bawrite(bp);
			} else
				bdwrite(bp);
			ip->i_flag |= IUPD|ICHG;
			if (u.u_ruid != 0)
				ip->i_mode &= ~(ISUID|ISGID);
		}
	} while (u.u_error == 0 && uio->uio_resid && n != 0);
	if (error == 0)				/* XXX */
		error = u.u_error;		/* XXX */
	if (error && (uio->uio_rw == UIO_WRITE) && (ioflag & IO_UNIT) &&
	    (type != IFBLK)) {
		itrunc(ip, osize, ioflag & IO_SYNC);
		uio->uio_offset -= (resid - uio->uio_resid);
		uio->uio_resid = resid;
		/*
		 * Should back out the change to the quota here but that would
		 * be a lot of work for little benefit.  Besides we've already
		 * made the assumption that the entire write would succeed and
		 * users can't turn on the IO_UNIT bit for their writes
		 * anyways.
		 */
	}
#ifdef whybother
	if (!error && (ioflag & IO_SYNC))
		IUPDAT(ip, &time, &time, 1);
#endif
	return (error);
}

/*
 * File-table ioctl entry point for inodes.  FIONREAD is answered from the
 * inode for regular files/directories; FIONBIO/FIOASYNC are accepted and
 * ignored; character specials are passed through to the driver's d_ioctl.
 */
ino_ioctl(fp, com, data)
	register struct file *fp;
	register u_int com;
	caddr_t data;
{
	register struct inode *ip = ((struct inode *)fp->f_data);
	dev_t dev;

	switch (ip->i_mode & IFMT) {

	case IFREG:
	case IFDIR:
		if (com == FIONREAD) {
			if (fp->f_type==DTYPE_PIPE && !(fp->f_flag&FREAD))
				*(off_t *)data = 0;
			else
				*(off_t *)data = ip->i_size - fp->f_offset;
			return (0);
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* fall into ... */

	default:
		return (ENOTTY);

	case IFCHR:
		dev = ip->i_rdev;
		u.u_r.r_val1 = 0;
		if (setjmp(&u.u_qsave))
			/*
			 * The ONLY way we can get here is via the longjump
			 * in sleep.  Signals have been checked for and
			 * u_error set accordingly.  All that remains to do
			 * is 'return'.
			 */
			return (u.u_error);
		return ((*cdevsw[major(dev)].d_ioctl)(dev,com,data,
		    fp->f_flag));
	}
}

/*
 * File-table select entry point for inodes.  Regular files and directories
 * are always ready; character specials defer to the driver's d_select.
 */
ino_select(fp, which)
	struct file *fp;
	int which;
{
	register struct inode *ip = (struct inode *)fp->f_data;
	register dev_t dev;

	switch (ip->i_mode & IFMT) {

	default:
		return (1);		/* XXX */

	case IFCHR:
		dev = ip->i_rdev;
		return (*cdevsw[major(dev)].d_select)(dev, which);
	}
}

/*
 * Fill in a stat structure from an in-core inode.  Pending IACC/IUPD/ICHG
 * timestamps are folded into the inode first (an inlined ITIMES).
 */
ino_stat(ip, sb)
	register struct inode *ip;
	register struct stat *sb;
{
	register struct icommon2 *ic2;

#ifdef EXTERNALITIMES
	mapseg5(xitimes, xitdesc);
	ic2 = &((struct icommon2 *)SEG5)[ip - inode];
#else
	ic2 = &ip->i_ic2;
#endif
	/*
	 * inlined ITIMES which takes advantage of the common times pointer.
	 */
	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		ip->i_flag |= IMOD;
		if (ip->i_flag & IACC)
			ic2->ic_atime = time.tv_sec;
		if (ip->i_flag & IUPD)
			ic2->ic_mtime = time.tv_sec;
		if (ip->i_flag & ICHG)
			ic2->ic_ctime = time.tv_sec;
		ip->i_flag &= ~(IUPD|IACC|ICHG);
	}
	sb->st_dev = ip->i_dev;
	sb->st_ino = ip->i_number;
	sb->st_mode = ip->i_mode;
	sb->st_nlink = ip->i_nlink;
	sb->st_uid = ip->i_uid;
	sb->st_gid = ip->i_gid;
	sb->st_rdev = (dev_t)ip->i_rdev;
	sb->st_size = ip->i_size;
	sb->st_atime = ic2->ic_atime;
	sb->st_spare1 = 0;
	sb->st_mtime = ic2->ic_mtime;
	sb->st_spare2 = 0;
	sb->st_ctime = ic2->ic_ctime;
	sb->st_spare3 = 0;
	sb->st_blksize = MAXBSIZE;
	/*
	 * blocks are too tough to do; it's not worth the effort.
	 */
	sb->st_blocks = btodb(ip->i_size + MAXBSIZE - 1);
	sb->st_flags = ip->i_flags;
	sb->st_spare4[0] = 0;
	sb->st_spare4[1] = 0;
	sb->st_spare4[2] = 0;
#ifdef EXTERNALITIMES
	normalseg5();
#endif
	return (0);
}

/*
 * This routine, like its counterpart openi(), calls the device driver for
 * special (IBLK, ICHR) files.  Normal files simply return early (the
 * default case in the switch statement).  Pipes and sockets do NOT come
 * here because they have their own close routines.
 */
closei(ip, flag)
	register struct inode *ip;
	int flag;
{
	register struct mount *mp;
	register struct file *fp;
	int mode, error;
	dev_t dev;
	int (*cfunc)();

	mode = ip->i_mode & IFMT;
	dev = ip->i_rdev;
	switch (mode) {

	case IFCHR:
		cfunc = cdevsw[major(dev)].d_close;
		break;

	case IFBLK:
		/*
		 * We don't want to really close the device if it is mounted
		 */
		/* MOUNT TABLE SHOULD HOLD INODE */
		for (mp = mount; mp < &mount[NMOUNT]; mp++)
			if (mp->m_inodp != NULL && mp->m_dev == dev)
				return (0);	/* was a bare 'return' -
						 * value was indeterminate */
		cfunc = bdevsw[major(dev)].d_close;
		break;

	default:
		return (0);
	}
	/*
	 * Check that another inode for the same device isn't active.
	 * This is because the same device can be referenced by two
	 * different inodes.
	 */
	for (fp = file; fp < fileNFILE; fp++) {
		if (fp->f_type != DTYPE_INODE)
			continue;
		if (fp->f_count && (ip = (struct inode *)fp->f_data) &&
		    ip->i_rdev == dev && (ip->i_mode&IFMT) == mode)
			return (0);
	}
	if (mode == IFBLK) {
		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		bflush(dev);
		binval(dev);
	}
	/*
	 * NOTE: none of the device drivers appear to either set u_error OR
	 * return anything meaningful from their close routines.  It's a good
	 * thing programs don't bother checking the error status on close()
	 * calls.  Apparently the only time "errno" is meaningful after a
	 * "close" is when the process is interrupted.
	 */
	if (setjmp(&u.u_qsave)) {
		/*
		 * If device close routine is interrupted,
		 * must return so closef can clean up.
		 */
		if ((error = u.u_error) == 0)
			error = EINTR;
	} else
		error = (*cfunc)(dev, flag, mode);
	return (error);
}

/*
 * Place an advisory lock on an inode.
 * NOTE: callers of this routine must be prepared to deal with the pseudo
 * error return ERESTART.
 */
ino_lock(fp, cmd)
	register struct file *fp;
	int cmd;
{
	register int priority = PLOCK;
	register struct inode *ip = (struct inode *)fp->f_data;
	int error;

	if ((cmd & LOCK_EX) == 0)
		priority += 4;
	/*
	 * If there's a exclusive lock currently applied to the file then
	 * we've gotta wait for the lock with everyone else.
	 *
	 * NOTE: We can NOT sleep on i_exlockc because it is on an odd byte
	 * boundary and the low (oddness) bit is reserved for networking/
	 * supervisor mode sleep channels.  Thus we always sleep on
	 * i_shlockc and simply check the proper bits to see if the lock we
	 * want is granted.  This may mean an extra wakeup/sleep event is
	 * done once in a while but everything will work correctly.
	 */
again:
	while (ip->i_flag & IEXLOCK) {
		/*
		 * If we're holding an exclusive
		 * lock, then release it.
		 */
		if (fp->f_flag & FEXLOCK) {
			ino_unlock(fp, FEXLOCK);
			continue;
		}
		if (cmd & LOCK_NB)
			return (EWOULDBLOCK);
		ip->i_flag |= ILWAIT;
		error = tsleep((caddr_t)&ip->i_shlockc,
		    priority | PCATCH, 0);
		if (error)
			return (error);
	}
	if ((cmd & LOCK_EX) && (ip->i_flag & ISHLOCK)) {
		/*
		 * Must wait for any shared locks to finish
		 * before we try to apply a exclusive lock.
		 *
		 * If we're holding a shared
		 * lock, then release it.
		 */
		if (fp->f_flag & FSHLOCK) {
			ino_unlock(fp, FSHLOCK);
			goto again;
		}
		if (cmd & LOCK_NB)
			return (EWOULDBLOCK);
		ip->i_flag |= ILWAIT;
		error = tsleep((caddr_t)&ip->i_shlockc, PLOCK | PCATCH, 0);
		if (error)
			return (error);
		goto again;
	}
	if (cmd & LOCK_EX) {
		cmd &= ~LOCK_SH;
		ip->i_exlockc++;
		ip->i_flag |= IEXLOCK;
		fp->f_flag |= FEXLOCK;
	}
	if ((cmd & LOCK_SH) && (fp->f_flag & FSHLOCK) == 0) {
		ip->i_shlockc++;
		ip->i_flag |= ISHLOCK;
		fp->f_flag |= FSHLOCK;
	}
	return (0);
}

/*
 * Unlock a file.
 */
ino_unlock(fp, kind)
	register struct file *fp;
	int kind;
{
	register struct inode *ip = (struct inode *)fp->f_data;
	register int flags;

	kind &= fp->f_flag;
	if (ip == NULL || kind == 0)
		return;
	flags = ip->i_flag;
	if (kind & FSHLOCK) {
		if (--ip->i_shlockc == 0) {
			ip->i_flag &= ~ISHLOCK;
			if (flags & ILWAIT)
				wakeup((caddr_t)&ip->i_shlockc);
		}
		fp->f_flag &= ~FSHLOCK;
	}
	if (kind & FEXLOCK) {
		if (--ip->i_exlockc == 0) {
			ip->i_flag &= ~(IEXLOCK|ILWAIT);
			if (flags & ILWAIT)
				wakeup((caddr_t)&ip->i_shlockc);
		}
		fp->f_flag &= ~FEXLOCK;
	}
}

/*
 * Openi called to allow handler of special files to initialize and
 * validate before actual IO.
 */
openi(ip, mode)
	register struct inode *ip;
	int mode;
{
	register dev_t dev = ip->i_rdev;
	register int maj = major(dev);
	dev_t bdev;
	int error;

	switch (ip->i_mode&IFMT) {

	case IFCHR:
		if (ip->i_fs->fs_flags & MNT_NODEV)
			return (ENXIO);
		if ((u_int)maj >= nchrdev)
			return (ENXIO);
		if (mode & FWRITE) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, IFCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV &&
				    (error = ufs_mountedon(bdev)))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}
		return ((*cdevsw[maj].d_open)(dev, mode, S_IFCHR));

	case IFBLK:
		if (ip->i_fs->fs_flags & MNT_NODEV)
			return (ENXIO);
		if ((u_int)maj >= nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && (mode & FWRITE) &&
		    isdisk(dev, IFBLK))
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 *
		 * 2.11BSD must relax this restriction to allow 'fsck' to
		 * open the root filesystem (which is always mounted) during
		 * a reboot.  Once in secure or very secure mode the
		 * above restriction is fully effective.  On the otherhand
		 * fsck should 1) use the raw device, 2) not do sync calls...
		 */
		if (securelevel > 0 && (error = ufs_mountedon(dev)))
			return (error);
		return ((*bdevsw[maj].d_open)(dev, mode, S_IFBLK));
	}
	return (0);
}

/*
 * Revoke access the current tty by all processes.
 * Used only by the super-user in init
 * to give ``clean'' terminals at login.
 */
vhangup()
{

	if (!suser())
		return;
	if (u.u_ttyp == NULL)
		return;
	forceclose(u.u_ttyd);
	if ((u.u_ttyp->t_state) & TS_ISOPEN)
		gsignal(u.u_ttyp->t_pgrp, SIGHUP);
}

/*
 * Strip read/write access from every open file-table entry that refers
 * to the given character device (the controlling tty being revoked).
 */
forceclose(dev)
	register dev_t dev;
{
	register struct file *fp;
	register struct inode *ip;

	for (fp = file; fp < fileNFILE; fp++) {
		if (fp->f_count == 0)
			continue;
		if (fp->f_type != DTYPE_INODE)
			continue;
		ip = (struct inode *)fp->f_data;
		if (ip == 0)
			continue;
		if ((ip->i_mode & IFMT) != IFCHR)
			continue;
		if (ip->i_rdev != dev)
			continue;
		fp->f_flag &= ~(FREAD|FWRITE);
	}
}