1 /*
2 * linux/fs/super.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * super.c contains code to handle: - mount structures
7 * - super-block tables
8 * - filesystem drivers list
9 * - mount system call
10 * - umount system call
11 * - ustat system call
12 *
13 * GK 2/5/95 - Changed to support mounting the root fs via NFS
14 *
15 * Added kerneld support: Jacques Gelinas and Bjorn Ekwall
16 * Added change_root: Werner Almesberger & Hans Lermen, Feb '96
17 * Added options to /proc/mounts:
18 * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
19 * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
20 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
21 */
22
23 #include <linux/config.h>
24 #include <linux/string.h>
25 #include <linux/malloc.h>
26 #include <linux/locks.h>
27 #include <linux/smp_lock.h>
28 #include <linux/devfs_fs_kernel.h>
29 #include <linux/fd.h>
30 #include <linux/init.h>
31 #include <linux/quotaops.h>
32 #include <linux/acct.h>
33
34 #include <asm/uaccess.h>
35
36 #include <linux/nfs_fs.h>
37 #include <linux/nfs_fs_sb.h>
38 #include <linux/nfs_mount.h>
39
40 #include <linux/kmod.h>
41 #define __NO_VERSION__
42 #include <linux/module.h>
43
44 /*
45 * We use a semaphore to synchronize all mount/umount
46 * activity - imagine the mess if we have a race between
47 * unmounting a filesystem and re-mounting it (or something
48 * else).
49 */
static DECLARE_MUTEX(mount_sem);

/* Defined outside this file. */
extern void wait_for_keypress(void);

/* Defined outside this file. */
extern int root_mountflags;

/* Forward declaration: get_sb_single() calls this before its definition. */
static int do_remount_sb(struct super_block *sb, int flags, char * data);

/* this is initialized in init/main.c */
kdev_t ROOT_DEV;

/* Number of superblocks allocated so far by get_empty_super(). */
int nr_super_blocks;
/* Ceiling for nr_super_blocks; get_empty_super() refuses to allocate past it. */
int max_super_blocks = NR_SUPER;
/* List of all superblock structures, linked through super_block->s_list. */
LIST_HEAD(super_blocks);
64
65 /*
66 * Handling of filesystem drivers list.
67 * Rules:
68 * Inclusion to/removals from/scanning of list are protected by spinlock.
69 * During the unload module must call unregister_filesystem().
70 * We can access the fields of list element if:
71 * 1) spinlock is held or
72 * 2) we hold the reference to the module.
73 * The latter can be guaranteed by call of try_inc_mod_count(); if it
74 * returned 0 we must skip the element, otherwise we got the reference.
75 * Once the reference is obtained we can drop the spinlock.
76 */
77
/* Head of the singly linked list of registered filesystem types (chained via ->next). */
static struct file_system_type *file_systems;
/* Reader/writer lock protecting file_systems: taken for write to link/unlink, for read to scan. */
static rwlock_t file_systems_lock = RW_LOCK_UNLOCKED;
80
81 /* WARNING: This can be used only if we _already_ own a reference */
82 static void get_filesystem(struct file_system_type *fs)
83 {
84 if (fs->owner)
85 __MOD_INC_USE_COUNT(fs->owner);
86 }
87
88 static void put_filesystem(struct file_system_type *fs)
89 {
90 if (fs->owner)
91 __MOD_DEC_USE_COUNT(fs->owner);
92 }
93
94 static struct file_system_type **find_filesystem(const char *name)
95 {
96 struct file_system_type **p;
97 for (p=&file_systems; *p; p=&(*p)->next)
98 if (strcmp((*p)->name,name) == 0)
99 break;
100 return p;
101 }
102
103 /**
104 * register_filesystem - register a new filesystem
105 * @fs: the file system structure
106 *
107 * Adds the file system passed to the list of file systems the kernel
108 * is aware of for mount and other syscalls. Returns 0 on success,
109 * or a negative errno code on an error.
110 *
111 * The &struct file_system_type that is passed is linked into the kernel
112 * structures and must not be freed until the file system has been
113 * unregistered.
114 */
115
116 int register_filesystem(struct file_system_type * fs)
117 {
118 int res = 0;
119 struct file_system_type ** p;
120
121 if (!fs)
122 return -EINVAL;
123 if (fs->next)
124 return -EBUSY;
125 write_lock(&file_systems_lock);
126 p = find_filesystem(fs->name);
127 if (*p)
128 res = -EBUSY;
129 else
130 *p = fs;
131 write_unlock(&file_systems_lock);
132 return res;
133 }
134
135 /**
136 * unregister_filesystem - unregister a file system
137 * @fs: filesystem to unregister
138 *
139 * Remove a file system that was previously successfully registered
140 * with the kernel. An error is returned if the file system is not found.
141 * Zero is returned on a success.
142 *
143 * Once this function has returned the &struct file_system_type structure
144 * may be freed or reused.
145 */
146
147 int unregister_filesystem(struct file_system_type * fs)
148 {
149 struct file_system_type ** tmp;
150
151 write_lock(&file_systems_lock);
152 tmp = &file_systems;
153 while (*tmp) {
154 if (fs == *tmp) {
155 *tmp = fs->next;
156 fs->next = NULL;
157 write_unlock(&file_systems_lock);
158 return 0;
159 }
160 tmp = &(*tmp)->next;
161 }
162 write_unlock(&file_systems_lock);
163 return -EINVAL;
164 }
165
166 static int fs_index(const char * __name)
167 {
168 struct file_system_type * tmp;
169 char * name;
170 int err, index;
171
172 name = getname(__name);
173 err = PTR_ERR(name);
174 if (IS_ERR(name))
175 return err;
176
177 err = -EINVAL;
178 read_lock(&file_systems_lock);
179 for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
180 if (strcmp(tmp->name,name) == 0) {
181 err = index;
182 break;
183 }
184 }
185 read_unlock(&file_systems_lock);
186 putname(name);
187 return err;
188 }
189
190 static int fs_name(unsigned int index, char * buf)
191 {
192 struct file_system_type * tmp;
193 int len, res;
194
195 read_lock(&file_systems_lock);
196 for (tmp = file_systems; tmp; tmp = tmp->next, index--)
197 if (index <= 0 && try_inc_mod_count(tmp->owner))
198 break;
199 read_unlock(&file_systems_lock);
200 if (!tmp)
201 return -EINVAL;
202
203 /* OK, we got the reference, so we can safely block */
204 len = strlen(tmp->name) + 1;
205 res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
206 put_filesystem(tmp);
207 return res;
208 }
209
210 static int fs_maxindex(void)
211 {
212 struct file_system_type * tmp;
213 int index;
214
215 read_lock(&file_systems_lock);
216 for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
217 ;
218 read_unlock(&file_systems_lock);
219 return index;
220 }
221
222 /*
223 * Whee.. Weird sysv syscall.
224 */
225 asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
226 {
227 int retval = -EINVAL;
228
229 switch (option) {
230 case 1:
231 retval = fs_index((const char *) arg1);
232 break;
233
234 case 2:
235 retval = fs_name(arg1, (char *) arg2);
236 break;
237
238 case 3:
239 retval = fs_maxindex();
240 break;
241 }
242 return retval;
243 }
244
245 int get_filesystem_list(char * buf)
246 {
247 int len = 0;
248 struct file_system_type * tmp;
249
250 read_lock(&file_systems_lock);
251 tmp = file_systems;
252 while (tmp && len < PAGE_SIZE - 80) {
253 len += sprintf(buf+len, "%s\t%s\n",
254 (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
255 tmp->name);
256 tmp = tmp->next;
257 }
258 read_unlock(&file_systems_lock);
259 return len;
260 }
261
262 struct file_system_type *get_fs_type(const char *name)
263 {
264 struct file_system_type *fs;
265
266 read_lock(&file_systems_lock);
267 fs = *(find_filesystem(name));
268 if (fs && !try_inc_mod_count(fs->owner))
269 fs = NULL;
270 read_unlock(&file_systems_lock);
271 if (!fs && (request_module(name) == 0)) {
272 read_lock(&file_systems_lock);
273 fs = *(find_filesystem(name));
274 if (fs && !try_inc_mod_count(fs->owner))
275 fs = NULL;
276 read_unlock(&file_systems_lock);
277 }
278 return fs;
279 }
280
/* List of all vfsmount nodes, linked through vfsmount->mnt_list; add_vfsmnt() appends at the tail. */
static LIST_HEAD(vfsmntlist);
282
283 /**
284 * add_vfsmnt - add a new mount node
285 * @nd: location of mountpoint or %NULL if we want a root node
286 * @root: root of (sub)tree to be mounted
287 * @dev_name: device name to show in /proc/mounts or %NULL (for "none").
288 *
289 * This is VFS idea of mount. New node is allocated, bound to a tree
290 * we are mounting and optionally (OK, usually) registered as mounted
291 * on a given mountpoint. Returns a pointer to new node or %NULL in
292 * case of failure.
293 *
294 * Potential reason for failure (aside of trivial lack of memory) is a
295 * deleted mountpoint. Caller must hold ->i_zombie on mountpoint
296 * dentry (if any).
297 *
 * Node is marked as MNT_VISIBLE (visible in /proc/mounts) unless both
 * @nd and @dev_name are %NULL. It works since we pass non-%NULL @dev_name
 * when we are mounting root and kern_mount() filesystems are deviceless.
 * If we ever get a kern_mount() filesystem with a nontrivial @dev_name we
 * will have to pass the visibility flag explicitly, so adding support
 * for such beasts will mean changing the prototype.
304 */
305
306 static struct vfsmount *add_vfsmnt(struct nameidata *nd,
307 struct dentry *root,
308 const char *dev_name)
309 {
310 struct vfsmount *mnt;
311 struct super_block *sb = root->d_inode->i_sb;
312 char *name;
313
314 mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
315 if (!mnt)
316 goto out;
317 memset(mnt, 0, sizeof(struct vfsmount));
318
319 if (nd || dev_name)
320 mnt->mnt_flags = MNT_VISIBLE;
321
322 /* It may be NULL, but who cares? */
323 if (dev_name) {
324 name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
325 if (name) {
326 strcpy(name, dev_name);
327 mnt->mnt_devname = name;
328 }
329 }
330 mnt->mnt_owner = current->uid;
331 atomic_set(&mnt->mnt_count,1);
332 mnt->mnt_sb = sb;
333
334 spin_lock(&dcache_lock);
335 if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
336 goto fail;
337 mnt->mnt_root = dget(root);
338 mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);
339 mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;
340
341 if (nd) {
342 list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
343 list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
344 } else {
345 INIT_LIST_HEAD(&mnt->mnt_child);
346 INIT_LIST_HEAD(&mnt->mnt_clash);
347 }
348 INIT_LIST_HEAD(&mnt->mnt_mounts);
349 list_add(&mnt->mnt_instances, &sb->s_mounts);
350 list_add(&mnt->mnt_list, vfsmntlist.prev);
351 spin_unlock(&dcache_lock);
352 out:
353 return mnt;
354 fail:
355 spin_unlock(&dcache_lock);
356 if (mnt->mnt_devname)
357 kfree(mnt->mnt_devname);
358 kfree(mnt);
359 return NULL;
360 }
361
362 static void move_vfsmnt(struct vfsmount *mnt,
363 struct dentry *mountpoint,
364 struct vfsmount *parent,
365 const char *dev_name)
366 {
367 struct dentry *old_mountpoint;
368 struct vfsmount *old_parent;
369 char *new_devname = NULL;
370
371 if (dev_name) {
372 new_devname = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
373 if (new_devname)
374 strcpy(new_devname, dev_name);
375 }
376
377 spin_lock(&dcache_lock);
378 old_mountpoint = mnt->mnt_mountpoint;
379 old_parent = mnt->mnt_parent;
380
381 /* flip names */
382 if (new_devname) {
383 if (mnt->mnt_devname)
384 kfree(mnt->mnt_devname);
385 mnt->mnt_devname = new_devname;
386 }
387
388 /* flip the linkage */
389 mnt->mnt_mountpoint = dget(mountpoint);
390 mnt->mnt_parent = parent ? mntget(parent) : mnt;
391 list_del(&mnt->mnt_clash);
392 list_del(&mnt->mnt_child);
393 if (parent) {
394 list_add(&mnt->mnt_child, &parent->mnt_mounts);
395 list_add(&mnt->mnt_clash, &mountpoint->d_vfsmnt);
396 } else {
397 INIT_LIST_HEAD(&mnt->mnt_child);
398 INIT_LIST_HEAD(&mnt->mnt_clash);
399 }
400 spin_unlock(&dcache_lock);
401
402 /* put the old stuff */
403 dput(old_mountpoint);
404 if (old_parent != mnt)
405 mntput(old_parent);
406 }
407
408 /*
409 * Called with spinlock held, releases it.
410 */
411 static void remove_vfsmnt(struct vfsmount *mnt)
412 {
413 /* First of all, remove it from all lists */
414 list_del(&mnt->mnt_instances);
415 list_del(&mnt->mnt_clash);
416 list_del(&mnt->mnt_list);
417 list_del(&mnt->mnt_child);
418 spin_unlock(&dcache_lock);
419 /* Now we can work safely */
420 if (mnt->mnt_parent != mnt)
421 mntput(mnt->mnt_parent);
422
423 dput(mnt->mnt_mountpoint);
424 dput(mnt->mnt_root);
425 if (mnt->mnt_devname)
426 kfree(mnt->mnt_devname);
427 kfree(mnt);
428 }
429
430
/* Use octal escapes, like mount does, for embedded spaces etc. */
static unsigned char need_escaping[] = { ' ', '\t', '\n', '\\' };

/*
 * mangle - copy a string, octal-escaping the characters in need_escaping
 * @s:   NUL-terminated source string
 * @buf: destination buffer
 * @len: space available in @buf
 *
 * Each byte listed in need_escaping is written as a backslash followed
 * by three octal digits (e.g. ' ' becomes "\040"); everything else is
 * copied unchanged.  Copying stops at the end of @s or as soon as fewer
 * than four bytes remain in @buf, so a full escape sequence always fits.
 *
 * Returns the number of bytes written.  No trailing NUL is stored.
 */
static int
mangle(const unsigned char *s, char *buf, int len) {
	char *sp;
	unsigned int n;

	sp = buf;
	while (*s && sp-buf < len-3) {
		for (n = 0; n < sizeof(need_escaping); n++) {
			if (*s == need_escaping[n]) {
				/* '0' is the base for each of the three octal digits */
				*sp++ = '\\';
				*sp++ = '0' + ((*s & 0300) >> 6);
				*sp++ = '0' + ((*s & 070) >> 3);
				*sp++ = '0' + (*s & 07);
				goto next;
			}
		}
		*sp++ = *s;
	next:
		s++;
	}
	return sp - buf;	/* no trailing NUL */
}
456
/*
 * Generic mount flags and the option text get_filesystem_info() emits
 * for each one that is set in a superblock's s_flags.  The entry with
 * flag == 0 terminates the table.
 */
static struct proc_fs_info {
	int flag;	/* MS_* bit tested against sb->s_flags */
	char *str;	/* text appended (leading comma included) when the bit is set */
} fs_info[] = {
	{ MS_NOEXEC, ",noexec" },
	{ MS_NOSUID, ",nosuid" },
	{ MS_NODEV, ",nodev" },
	{ MS_SYNCHRONOUS, ",sync" },
	{ MS_MANDLOCK, ",mand" },
	{ MS_NOATIME, ",noatime" },
	{ MS_NODIRATIME, ",nodiratime" },
#ifdef MS_NOSUB	/* Can't find this except in mount.c */
	{ MS_NOSUB, ",nosub" },
#endif
	{ 0, NULL }
};
473
/*
 * NFS mount flags and the option text get_filesystem_info() emits for
 * them: str when the bit is set in the nfs_server flags, nostr when it
 * is clear.  The entry with flag == 0 terminates the table.
 */
static struct proc_nfs_info {
	int flag;	/* NFS_MOUNT_* bit tested against nfss->flags */
	char *str;	/* emitted when the bit is set */
	char *nostr;	/* emitted when the bit is clear (may be "") */
} nfs_info[] = {
	{ NFS_MOUNT_SOFT, ",soft", ",hard" },
	{ NFS_MOUNT_INTR, ",intr", "" },
	{ NFS_MOUNT_POSIX, ",posix", "" },
	{ NFS_MOUNT_TCP, ",tcp", ",udp" },
	{ NFS_MOUNT_NOCTO, ",nocto", "" },
	{ NFS_MOUNT_NOAC, ",noac", "" },
	{ NFS_MOUNT_NONLM, ",nolock", ",lock" },
	{ NFS_MOUNT_BROKEN_SUID, ",broken_suid", "" },
	{ 0, NULL, NULL }
};
489
/*
 * get_filesystem_info - format the list of visible mounts as text
 * @buf: output buffer; the size arithmetic below treats it as PAGE_SIZE
 *       bytes (NOTE(review): confirm against the caller)
 *
 * Emits one line per MNT_VISIBLE vfsmount on vfsmntlist:
 *   <devname> <path> <fstype> <ro|rw>[,options] 0 0
 * with devname, path and type passed through mangle().  For mounts whose
 * type name is "nfs", NFS-specific options are appended as well.
 *
 * Returns the number of bytes written, or 0 when the scratch page for
 * d_path() cannot be allocated.  If the output runs out of room, the
 * last entry is replaced by a "# truncated" line.
 */
int get_filesystem_info( char *buf )
{
	struct list_head *p;
	struct proc_fs_info *fs_infop;
	struct proc_nfs_info *nfs_infop;
	struct nfs_server *nfss;
	int len, prevlen;
	/* scratch page that d_path() builds the mount path in */
	char *path, *buffer = (char *) __get_free_page(GFP_KERNEL);

	if (!buffer) return 0;
	len = prevlen = 0;

/* Room left in buf, keeping 200 bytes in reserve for the sprintf() output. */
#define FREEROOM ((int)PAGE_SIZE-200-len)
/* Append an escaped copy of s, bounded by FREEROOM (note: expands with its own ';'). */
#define MANGLE(s) len += mangle((s), buf+len, FREEROOM);

	for (p = vfsmntlist.next; p != &vfsmntlist; p = p->next) {
		struct vfsmount *tmp = list_entry(p, struct vfsmount, mnt_list);
		if (!(tmp->mnt_flags & MNT_VISIBLE))
			continue;
		path = d_path(tmp->mnt_root, tmp, buffer, PAGE_SIZE);
		if (!path)
			continue;
		MANGLE(tmp->mnt_devname ? tmp->mnt_devname : "none");
		buf[len++] = ' ';
		MANGLE(path);
		buf[len++] = ' ';
		MANGLE(tmp->mnt_sb->s_type->name);
		len += sprintf(buf+len, " %s",
			       tmp->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
		/* generic mount options */
		for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
			if (tmp->mnt_sb->s_flags & fs_infop->flag)
				MANGLE(fs_infop->str);
		}
		/* NFS-specific options */
		if (!strcmp("nfs", tmp->mnt_sb->s_type->name)) {
			nfss = &tmp->mnt_sb->u.nfs_sb.s_server;
			len += sprintf(buf+len, ",v%d", nfss->rpc_ops->version);

			len += sprintf(buf+len, ",rsize=%d", nfss->rsize);

			len += sprintf(buf+len, ",wsize=%d", nfss->wsize);
#if 0
			if (nfss->timeo != 7*HZ/10) {
				len += sprintf(buf+len, ",timeo=%d",
					       nfss->timeo*10/HZ);
			}
			if (nfss->retrans != 3) {
				len += sprintf(buf+len, ",retrans=%d",
					       nfss->retrans);
			}
#endif
			/* attribute-cache timeouts are printed only when they differ from the values compared here */
			if (nfss->acregmin != 3*HZ) {
				len += sprintf(buf+len, ",acregmin=%d",
					       nfss->acregmin/HZ);
			}
			if (nfss->acregmax != 60*HZ) {
				len += sprintf(buf+len, ",acregmax=%d",
					       nfss->acregmax/HZ);
			}
			if (nfss->acdirmin != 30*HZ) {
				len += sprintf(buf+len, ",acdirmin=%d",
					       nfss->acdirmin/HZ);
			}
			if (nfss->acdirmax != 60*HZ) {
				len += sprintf(buf+len, ",acdirmax=%d",
					       nfss->acdirmax/HZ);
			}
			for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
				char *str;
				if (nfss->flags & nfs_infop->flag)
					str = nfs_infop->str;
				else
					str = nfs_infop->nostr;
				MANGLE(str);
			}
			len += sprintf(buf+len, ",addr=");
			MANGLE(nfss->hostname);
		}
		len += sprintf(buf + len, " 0 0\n");
		/* Out of room: roll back to the end of the previous entry and stop. */
		if (FREEROOM <= 3) {
			len = prevlen;
			len += sprintf(buf+len, "# truncated\n");
			break;
		}
		prevlen = len;
	}

	free_page((unsigned long) buffer);
	return len;
#undef MANGLE
#undef FREEROOM
}
581
582 /**
583 * __wait_on_super - wait on a superblock
584 * @sb: superblock to wait on
585 *
586 * Waits for a superblock to become unlocked and then returns. It does
587 * not take the lock. This is an internal function. See wait_on_super().
588 */
589
590 void __wait_on_super(struct super_block * sb)
591 {
592 DECLARE_WAITQUEUE(wait, current);
593
594 add_wait_queue(&sb->s_wait, &wait);
595 repeat:
596 set_current_state(TASK_UNINTERRUPTIBLE);
597 if (sb->s_lock) {
598 schedule();
599 goto repeat;
600 }
601 remove_wait_queue(&sb->s_wait, &wait);
602 current->state = TASK_RUNNING;
603 }
604
605 /*
606 * Note: check the dirty flag before waiting, so we don't
607 * hold up the sync while mounting a device. (The newly
608 * mounted device won't need syncing.)
609 */
610 void sync_supers(kdev_t dev)
611 {
612 struct super_block * sb;
613
614 for (sb = sb_entry(super_blocks.next);
615 sb != sb_entry(&super_blocks);
616 sb = sb_entry(sb->s_list.next)) {
617 if (!sb->s_dev)
618 continue;
619 if (dev && sb->s_dev != dev)
620 continue;
621 if (!sb->s_dirt)
622 continue;
623 lock_super(sb);
624 if (sb->s_dev && sb->s_dirt && (!dev || dev == sb->s_dev))
625 if (sb->s_op && sb->s_op->write_super)
626 sb->s_op->write_super(sb);
627 unlock_super(sb);
628 }
629 }
630
631 /**
632 * get_super - get the superblock of a device
633 * @dev: device to get the superblock for
634 *
635 * Scans the superblock list and finds the superblock of the file system
636 * mounted on the device given. %NULL is returned if no match is found.
637 */
638
639 struct super_block * get_super(kdev_t dev)
640 {
641 struct super_block * s;
642
643 if (!dev)
644 return NULL;
645 restart:
646 s = sb_entry(super_blocks.next);
647 while (s != sb_entry(&super_blocks))
648 if (s->s_dev == dev) {
649 wait_on_super(s);
650 if (s->s_dev == dev)
651 return s;
652 goto restart;
653 } else
654 s = sb_entry(s->s_list.next);
655 return NULL;
656 }
657
658 asmlinkage long sys_ustat(dev_t dev, struct ustat * ubuf)
659 {
660 struct super_block *s;
661 struct ustat tmp;
662 struct statfs sbuf;
663 int err = -EINVAL;
664
665 lock_kernel();
666 s = get_super(to_kdev_t(dev));
667 unlock_kernel();
668 if (s == NULL)
669 goto out;
670 err = vfs_statfs(s, &sbuf);
671 if (err)
672 goto out;
673
674 memset(&tmp,0,sizeof(struct ustat));
675 tmp.f_tfree = sbuf.f_bfree;
676 tmp.f_tinode = sbuf.f_ffree;
677
678 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
679 out:
680 return err;
681 }
682
683 /**
684 * get_empty_super - find empty superblocks
685 *
686 * Find a superblock with no device assigned. A free superblock is
 * found and returned. If necessary new superblocks are allocated.
688 * %NULL is returned if there are insufficient resources to complete
689 * the request.
690 */
691
692 struct super_block *get_empty_super(void)
693 {
694 struct super_block *s;
695
696 for (s = sb_entry(super_blocks.next);
697 s != sb_entry(&super_blocks);
698 s = sb_entry(s->s_list.next)) {
699 if (s->s_dev)
700 continue;
701 if (!s->s_lock)
702 return s;
703 printk("VFS: empty superblock %p locked!\n", s);
704 }
705 /* Need a new one... */
706 if (nr_super_blocks >= max_super_blocks)
707 return NULL;
708 s = kmalloc(sizeof(struct super_block), GFP_USER);
709 if (s) {
710 nr_super_blocks++;
711 memset(s, 0, sizeof(struct super_block));
712 INIT_LIST_HEAD(&s->s_dirty);
713 list_add (&s->s_list, super_blocks.prev);
714 init_waitqueue_head(&s->s_wait);
715 INIT_LIST_HEAD(&s->s_files);
716 INIT_LIST_HEAD(&s->s_mounts);
717 }
718 return s;
719 }
720
721 static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
722 struct file_system_type *type, int flags,
723 void *data, int silent)
724 {
725 struct super_block * s;
726 s = get_empty_super();
727 if (!s)
728 goto out;
729 s->s_dev = dev;
730 s->s_bdev = bdev;
731 s->s_flags = flags;
732 s->s_dirt = 0;
733 sema_init(&s->s_vfs_rename_sem,1);
734 sema_init(&s->s_nfsd_free_path_sem,1);
735 s->s_type = type;
736 sema_init(&s->s_dquot.dqio_sem, 1);
737 sema_init(&s->s_dquot.dqoff_sem, 1);
738 s->s_dquot.flags = 0;
739 lock_super(s);
740 if (!type->read_super(s, data, silent))
741 goto out_fail;
742 unlock_super(s);
743 /* tell bdcache that we are going to keep this one */
744 if (bdev)
745 atomic_inc(&bdev->bd_count);
746 out:
747 return s;
748
749 out_fail:
750 s->s_dev = 0;
751 s->s_bdev = 0;
752 s->s_type = NULL;
753 unlock_super(s);
754 return NULL;
755 }
756
757 /*
758 * Unnamed block devices are dummy devices used by virtual
759 * filesystems which don't use real block-devices. -- jrs
760 */
761
762 static unsigned int unnamed_dev_in_use[256/(8*sizeof(unsigned int))];
763
764 kdev_t get_unnamed_dev(void)
765 {
766 int i;
767
768 for (i = 1; i < 256; i++) {
769 if (!test_and_set_bit(i,unnamed_dev_in_use))
770 return MKDEV(UNNAMED_MAJOR, i);
771 }
772 return 0;
773 }
774
775 void put_unnamed_dev(kdev_t dev)
776 {
777 if (!dev || MAJOR(dev) != UNNAMED_MAJOR)
778 return;
779 if (test_and_clear_bit(MINOR(dev), unnamed_dev_in_use))
780 return;
781 printk("VFS: put_unnamed_dev: freeing unused device %s\n",
782 kdevname(dev));
783 }
784
785 static struct super_block *get_sb_bdev(struct file_system_type *fs_type,
786 char *dev_name, int flags, void * data)
787 {
788 struct inode *inode;
789 struct block_device *bdev;
790 struct block_device_operations *bdops;
791 struct super_block * sb;
792 struct nameidata nd;
793 kdev_t dev;
794 int error = 0;
795 /* What device it is? */
796 if (!dev_name || !*dev_name)
797 return ERR_PTR(-EINVAL);
798 if (path_init(dev_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
799 error = path_walk(dev_name, &nd);
800 if (error)
801 return ERR_PTR(error);
802 inode = nd.dentry->d_inode;
803 error = -ENOTBLK;
804 if (!S_ISBLK(inode->i_mode))
805 goto out;
806 error = -EACCES;
807 if (IS_NODEV(inode))
808 goto out;
809 bdev = inode->i_bdev;
810 bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
811 if (bdops) bdev->bd_op = bdops;
812 /* Done with lookups, semaphore down */
813 down(&mount_sem);
814 dev = to_kdev_t(bdev->bd_dev);
815 sb = get_super(dev);
816 if (sb) {
817 if (fs_type == sb->s_type &&
818 ((flags ^ sb->s_flags) & MS_RDONLY) == 0) {
819 path_release(&nd);
820 return sb;
821 }
822 } else {
823 mode_t mode = FMODE_READ; /* we always need it ;-) */
824 if (!(flags & MS_RDONLY))
825 mode |= FMODE_WRITE;
826 error = blkdev_get(bdev, mode, 0, BDEV_FS);
827 if (error)
828 goto out;
829 check_disk_change(dev);
830 error = -EACCES;
831 if (!(flags & MS_RDONLY) && is_read_only(dev))
832 goto out1;
833 error = -EINVAL;
834 sb = read_super(dev, bdev, fs_type, flags, data, 0);
835 if (sb) {
836 get_filesystem(fs_type);
837 path_release(&nd);
838 return sb;
839 }
840 out1:
841 blkdev_put(bdev, BDEV_FS);
842 }
843 out:
844 path_release(&nd);
845 up(&mount_sem);
846 return ERR_PTR(error);
847 }
848
849 static struct super_block *get_sb_nodev(struct file_system_type *fs_type,
850 int flags, void * data)
851 {
852 kdev_t dev;
853 int error = -EMFILE;
854 down(&mount_sem);
855 dev = get_unnamed_dev();
856 if (dev) {
857 struct super_block * sb;
858 error = -EINVAL;
859 sb = read_super(dev, NULL, fs_type, flags, data, 0);
860 if (sb) {
861 get_filesystem(fs_type);
862 return sb;
863 }
864 put_unnamed_dev(dev);
865 }
866 up(&mount_sem);
867 return ERR_PTR(error);
868 }
869
870 static struct super_block *get_sb_single(struct file_system_type *fs_type,
871 int flags, void *data)
872 {
873 struct super_block * sb;
874 /*
875 * Get the superblock of kernel-wide instance, but
876 * keep the reference to fs_type.
877 */
878 down(&mount_sem);
879 sb = fs_type->kern_mnt->mnt_sb;
880 if (!sb)
881 BUG();
882 get_filesystem(fs_type);
883 do_remount_sb(sb, flags, data);
884 return sb;
885 }
886
887 static void kill_super(struct super_block *sb, int umount_root)
888 {
889 struct block_device *bdev;
890 kdev_t dev;
891 struct dentry *root = sb->s_root;
892 struct file_system_type *fs = sb->s_type;
893 struct super_operations *sop = sb->s_op;
894
895 sb->s_root = NULL;
896 /* Need to clean after the sucker */
897 if (fs->fs_flags & FS_LITTER)
898 d_genocide(root);
899 if (fs->fs_flags & (FS_SINGLE|FS_LITTER))
900 shrink_dcache_parent(root);
901 dput(root);
902 lock_super(sb);
903 if (sop) {
904 if (sop->write_super && sb->s_dirt)
905 sop->write_super(sb);
906 if (sop->put_super)
907 sop->put_super(sb);
908 }
909
910 /* Forget any remaining inodes */
911 if (invalidate_inodes(sb)) {
912 printk("VFS: Busy inodes after unmount. "
913 "Self-destruct in 5 seconds. Have a nice day...\n");
914 }
915
916 dev = sb->s_dev;
917 sb->s_dev = 0; /* Free the superblock */
918 bdev = sb->s_bdev;
919 sb->s_bdev = NULL;
920 put_filesystem(fs);
921 sb->s_type = NULL;
922 unlock_super(sb);
923 if (umount_root) {
924 /* special: the old device driver is going to be
925 a ramdisk and the point of this call is to free its
926 protected memory (even if dirty). */
927 destroy_buffers(dev);
928 }
929 if (bdev) {
930 blkdev_put(bdev, BDEV_FS);
931 bdput(bdev);
932 } else
933 put_unnamed_dev(dev);
934 }
935
936 /*
937 * Alters the mount flags of a mounted file system. Only the mount point
938 * is used as a reference - file system type and the device are ignored.
939 */
940
941 static int do_remount_sb(struct super_block *sb, int flags, char *data)
942 {
943 int retval;
944
945 if (!(flags & MS_RDONLY) && sb->s_dev && is_read_only(sb->s_dev))
946 return -EACCES;
947 /*flags |= MS_RDONLY;*/
948 /* If we are remounting RDONLY, make sure there are no rw files open */
949 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
950 if (!fs_may_remount_ro(sb))
951 return -EBUSY;
952 if (sb->s_op && sb->s_op->remount_fs) {
953 lock_super(sb);
954 retval = sb->s_op->remount_fs(sb, &flags, data);
955 unlock_super(sb);
956 if (retval)
957 return retval;
958 }
959 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
960
961 /*
962 * We can't invalidate inodes as we can loose data when remounting
963 * (someone might manage to alter data while we are waiting in lock_super()
964 * or in foo_remount_fs()))
965 */
966
967 return 0;
968 }
969
970 struct vfsmount *kern_mount(struct file_system_type *type)
971 {
972 kdev_t dev = get_unnamed_dev();
973 struct super_block *sb;
974 struct vfsmount *mnt;
975 if (!dev)
976 return ERR_PTR(-EMFILE);
977 sb = read_super(dev, NULL, type, 0, NULL, 0);
978 if (!sb) {
979 put_unnamed_dev(dev);
980 return ERR_PTR(-EINVAL);
981 }
982 mnt = add_vfsmnt(NULL, sb->s_root, NULL);
983 if (!mnt) {
984 kill_super(sb, 0);
985 return ERR_PTR(-ENOMEM);
986 }
987 type->kern_mnt = mnt;
988 return mnt;
989 }
990
991 /* Call only after unregister_filesystem() - it's a final cleanup */
992
993 void kern_umount(struct vfsmount *mnt)
994 {
995 struct super_block *sb = mnt->mnt_sb;
996 spin_lock(&dcache_lock);
997 remove_vfsmnt(mnt);
998 kill_super(sb, 0);
999 }
1000
1001 /*
1002 * Doesn't take quota and stuff into account. IOW, in some cases it will
1003 * give false negatives. The main reason why it's here is that we need
1004 * a non-destructive way to look for easily umountable filesystems.
1005 */
1006 int may_umount(struct vfsmount *mnt)
1007 {
1008 if (atomic_read(&mnt->mnt_count) > 2)
1009 return -EBUSY;
1010 return 0;
1011 }
1012
1013 static int do_umount(struct vfsmount *mnt, int umount_root, int flags)
1014 {
1015 struct super_block * sb = mnt->mnt_sb;
1016
1017 /*
1018 * No sense to grab the lock for this test, but test itself looks
1019 * somewhat bogus. Suggestions for better replacement?
1020 * Ho-hum... In principle, we might treat that as umount + switch
1021 * to rootfs. GC would eventually take care of the old vfsmount.
1022 * The problem being: we have to implement rootfs and GC for that ;-)
1023 * Actually it makes sense, especially if rootfs would contain a
1024 * /reboot - static binary that would close all descriptors and
1025 * call reboot(9). Then init(8) could umount root and exec /reboot.
1026 */
1027 if (mnt == current->fs->rootmnt && !umount_root) {
1028 int retval = 0;
1029 /*
1030 * Special case for "unmounting" root ...
1031 * we just try to remount it readonly.
1032 */
1033 mntput(mnt);
1034 if (!(sb->s_flags & MS_RDONLY))
1035 retval = do_remount_sb(sb, MS_RDONLY, 0);
1036 return retval;
1037 }
1038
1039 spin_lock(&dcache_lock);
1040
1041 if (mnt->mnt_instances.next != mnt->mnt_instances.prev) {
1042 if (atomic_read(&mnt->mnt_count) > 2) {
1043 spin_unlock(&dcache_lock);
1044 mntput(mnt);
1045 return -EBUSY;
1046 }
1047 if (sb->s_type->fs_flags & FS_SINGLE)
1048 put_filesystem(sb->s_type);
1049 /* We hold two references, so mntput() is safe */
1050 mntput(mnt);
1051 remove_vfsmnt(mnt);
1052 return 0;
1053 }
1054 spin_unlock(&dcache_lock);
1055
1056 /*
1057 * Before checking whether the filesystem is still busy,
1058 * make sure the kernel doesn't hold any quota files open
1059 * on the device. If the umount fails, too bad -- there
1060 * are no quotas running any more. Just turn them on again.
1061 */
1062 DQUOT_OFF(sb);
1063 acct_auto_close(sb->s_dev);
1064
1065 /*
1066 * If we may have to abort operations to get out of this
1067 * mount, and they will themselves hold resources we must
1068 * allow the fs to do things. In the Unix tradition of
1069 * 'Gee thats tricky lets do it in userspace' the umount_begin
1070 * might fail to complete on the first run through as other tasks
1071 * must return, and the like. Thats for the mount program to worry
1072 * about for the moment.
1073 */
1074
1075 if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
1076 sb->s_op->umount_begin(sb);
1077
1078 /*
1079 * Shrink dcache, then fsync. This guarantees that if the
1080 * filesystem is quiescent at this point, then (a) only the
1081 * root entry should be in use and (b) that root entry is
1082 * clean.
1083 */
1084 shrink_dcache_sb(sb);
1085 fsync_dev(sb->s_dev);
1086
1087 if (sb->s_root->d_inode->i_state) {
1088 mntput(mnt);
1089 return -EBUSY;
1090 }
1091
1092 /* Something might grab it again - redo checks */
1093
1094 spin_lock(&dcache_lock);
1095 if (atomic_read(&mnt->mnt_count) > 2) {
1096 spin_unlock(&dcache_lock);
1097 mntput(mnt);
1098 return -EBUSY;
1099 }
1100
1101 /* OK, that's the point of no return */
1102 mntput(mnt);
1103 remove_vfsmnt(mnt);
1104
1105 kill_super(sb, umount_root);
1106 return 0;
1107 }
1108
1109 /*
1110 * Now umount can handle mount points as well as block devices.
1111 * This is important for filesystems which use unnamed block devices.
1112 *
1113 * We now support a flag for forced unmount like the other 'big iron'
1114 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1115 */
1116
1117 asmlinkage long sys_umount(char * name, int flags)
1118 {
1119 struct nameidata nd;
1120 char *kname;
1121 int retval;
1122
1123 lock_kernel();
1124 kname = getname(name);
1125 retval = PTR_ERR(kname);
1126 if (IS_ERR(kname))
1127 goto out;
1128 retval = 0;
1129 if (path_init(kname, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd))
1130 retval = path_walk(kname, &nd);
1131 putname(kname);
1132 if (retval)
1133 goto out;
1134 retval = -EINVAL;
1135 if (nd.dentry != nd.mnt->mnt_root)
1136 goto dput_and_out;
1137
1138 retval = -EPERM;
1139 if (!capable(CAP_SYS_ADMIN) && current->uid!=nd.mnt->mnt_owner)
1140 goto dput_and_out;
1141
1142 dput(nd.dentry);
1143 /* puts nd.mnt */
1144 down(&mount_sem);
1145 retval = do_umount(nd.mnt, 0, flags);
1146 up(&mount_sem);
1147 goto out;
1148 dput_and_out:
1149 path_release(&nd);
1150 out:
1151 unlock_kernel();
1152 return retval;
1153 }
1154
1155 /*
1156 * The 2.0 compatible umount. No flags.
1157 */
1158
1159 asmlinkage long sys_oldumount(char * name)
1160 {
1161 return sys_umount(name,0);
1162 }
1163
1164 static int mount_is_safe(struct nameidata *nd)
1165 {
1166 if (capable(CAP_SYS_ADMIN))
1167 return 0;
1168 return -EPERM;
1169 #ifdef notyet
1170 if (S_ISLNK(nd->dentry->d_inode->i_mode))
1171 return -EPERM;
1172 if (nd->dentry->d_inode->i_mode & S_ISVTX) {
1173 if (current->uid != nd->dentry->d_inode->i_uid)
1174 return -EPERM;
1175 }
1176 if (permission(nd->dentry->d_inode, MAY_WRITE))
1177 return -EPERM;
1178 return 0;
1179 #endif
1180 }
1181
1182 /*
1183 * do loopback mount.
1184 */
1185 static int do_loopback(char *old_name, char *new_name)
1186 {
1187 struct nameidata old_nd, new_nd;
1188 int err = 0;
1189 if (!old_name || !*old_name)
1190 return -EINVAL;
1191 if (path_init(old_name, LOOKUP_POSITIVE, &old_nd))
1192 err = path_walk(old_name, &old_nd);
1193 if (err)
1194 goto out;
1195 if (path_init(new_name, LOOKUP_POSITIVE, &new_nd))
1196 err = path_walk(new_name, &new_nd);
1197 if (err)
1198 goto out1;
1199 err = mount_is_safe(&new_nd);
1200 if (err)
1201 goto out2;
1202 err = -EINVAL;
1203 if (S_ISDIR(new_nd.dentry->d_inode->i_mode) !=
1204 S_ISDIR(old_nd.dentry->d_inode->i_mode))
1205 goto out2;
1206
1207 err = -ENOMEM;
1208 if (old_nd.mnt->mnt_sb->s_type->fs_flags & FS_SINGLE)
1209 get_filesystem(old_nd.mnt->mnt_sb->s_type);
1210
1211 down(&mount_sem);
1212 /* there we go */
1213 down(&new_nd.dentry->d_inode->i_zombie);
1214 if (IS_DEADDIR(new_nd.dentry->d_inode))
1215 err = -ENOENT;
1216 else if (add_vfsmnt(&new_nd, old_nd.dentry, old_nd.mnt->mnt_devname))
1217 err = 0;
1218 up(&new_nd.dentry->d_inode->i_zombie);
1219 up(&mount_sem);
1220 if (err && old_nd.mnt->mnt_sb->s_type->fs_flags & FS_SINGLE)
1221 put_filesystem(old_nd.mnt->mnt_sb->s_type);
1222 out2:
1223 path_release(&new_nd);
1224 out1:
1225 path_release(&old_nd);
1226 out:
1227 return err;
1228 }
1229
1230 /*
1231 * change filesystem flags. dir should be a physical root of filesystem.
1232 * If you've mounted a non-root directory somewhere and want to do remount
1233 * on it - tough luck.
1234 */
1235
1236 static int do_remount(const char *dir,int flags,char *data)
1237 {
1238 struct nameidata nd;
1239 int retval = 0;
1240
1241 if (!capable(CAP_SYS_ADMIN))
1242 return -EPERM;
1243
1244 if (path_init(dir, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
1245 retval = path_walk(dir, &nd);
1246 if (!retval) {
1247 struct super_block * sb = nd.dentry->d_inode->i_sb;
1248 retval = -ENODEV;
1249 if (sb) {
1250 retval = -EINVAL;
1251 if (nd.dentry == sb->s_root) {
1252 /*
1253 * Shrink the dcache and sync the device.
1254 */
1255 shrink_dcache_sb(sb);
1256 fsync_dev(sb->s_dev);
1257 if (flags & MS_RDONLY)
1258 acct_auto_close(sb->s_dev);
1259 retval = do_remount_sb(sb, flags, data);
1260 }
1261 }
1262 path_release(&nd);
1263 }
1264 return retval;
1265 }
1266
1267 static int copy_mount_options (const void *data, unsigned long *where)
1268 {
1269 int i;
1270 unsigned long page;
1271 unsigned long size;
1272
1273 *where = 0;
1274 if (!data)
1275 return 0;
1276
1277 if (!(page = __get_free_page(GFP_KERNEL)))
1278 return -ENOMEM;
1279
1280 /* We only care that *some* data at the address the user
1281 * gave us is valid. Just in case, we'll zero
1282 * the remainder of the page.
1283 */
1284 /* copy_from_user cannot cross TASK_SIZE ! */
1285 size = TASK_SIZE - (unsigned long)data;
1286 if (size > PAGE_SIZE)
1287 size = PAGE_SIZE;
1288
1289 i = size - copy_from_user((void *)page, data, size);
1290 if (!i) {
1291 free_page(page);
1292 return -EFAULT;
1293 }
1294 if (i != PAGE_SIZE)
1295 memset((char *)page + i, 0, PAGE_SIZE - i);
1296 *where = page;
1297 return 0;
1298 }
1299
1300 /*
1301 * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
1302 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
1303 *
1304 * data is a (void *) that can point to any structure up to
1305 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
1306 * information (or be NULL).
1307 *
1308 * NOTE! As pre-0.97 versions of mount() didn't use this setup, the
1309 * flags used to have a special 16-bit magic number in the high word:
1310 * 0xC0ED. If this magic number is present, the high word is discarded.
1311 */
1312 long do_mount(char * dev_name, char * dir_name, char *type_page,
1313 unsigned long flags, void *data_page)
1314 {
1315 struct file_system_type * fstype;
1316 struct nameidata nd;
1317 struct vfsmount *mnt = NULL;
1318 struct super_block *sb;
1319 int retval = 0;
1320
1321 /* Discard magic */
1322 if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
1323 flags &= ~MS_MGC_MSK;
1324
1325 /* Basic sanity checks */
1326
1327 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
1328 return -EINVAL;
1329 if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
1330 return -EINVAL;
1331
1332 /* OK, looks good, now let's see what do they want */
1333
1334 /* just change the flags? - capabilities are checked in do_remount() */
1335 if (flags & MS_REMOUNT)
1336 return do_remount(dir_name, flags & ~MS_REMOUNT,
1337 (char *) data_page);
1338
1339 /* "mount --bind"? Equivalent to older "mount -t bind" */
1340 /* No capabilities? What if users do thousands of these? */
1341 if (flags & MS_BIND)
1342 return do_loopback(dev_name, dir_name);
1343
1344 /* For the rest we need the type */
1345
1346 if (!type_page || !memchr(type_page, 0, PAGE_SIZE))
1347 return -EINVAL;
1348
1349 #if 0 /* Can be deleted again. Introduced in patch-2.3.99-pre6 */
1350 /* loopback mount? This is special - requires fewer capabilities */
1351 if (strcmp(type_page, "bind")==0)
1352 return do_loopback(dev_name, dir_name);
1353 #endif
1354
1355 /* for the rest we _really_ need capabilities... */
1356 if (!capable(CAP_SYS_ADMIN))
1357 return -EPERM;
1358
1359 /* ... filesystem driver... */
1360 fstype = get_fs_type(type_page);
1361 if (!fstype)
1362 return -ENODEV;
1363
1364 /* ... and mountpoint. Do the lookup first to force automounting. */
1365 if (path_init(dir_name,
1366 LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
1367 retval = path_walk(dir_name, &nd);
1368 if (retval)
1369 goto fs_out;
1370
1371 /* get superblock, locks mount_sem on success */
1372 if (fstype->fs_flags & FS_NOMOUNT)
1373 sb = ERR_PTR(-EINVAL);
1374 else if (fstype->fs_flags & FS_REQUIRES_DEV)
1375 sb = get_sb_bdev(fstype, dev_name, flags, data_page);
1376 else if (fstype->fs_flags & FS_SINGLE)
1377 sb = get_sb_single(fstype, flags, data_page);
1378 else
1379 sb = get_sb_nodev(fstype, flags, data_page);
1380
1381 retval = PTR_ERR(sb);
1382 if (IS_ERR(sb))
1383 goto dput_out;
1384
1385 /* Something was mounted here while we slept */
1386 while(d_mountpoint(nd.dentry) && follow_down(&nd.mnt, &nd.dentry))
1387 ;
1388
1389 /* Refuse the same filesystem on the same mount point */
1390 retval = -EBUSY;
1391 if (nd.mnt && nd.mnt->mnt_sb == sb
1392 && nd.mnt->mnt_root == nd.dentry)
1393 goto fail;
1394
1395 retval = -ENOENT;
1396 if (!nd.dentry->d_inode)
1397 goto fail;
1398 down(&nd.dentry->d_inode->i_zombie);
1399 if (!IS_DEADDIR(nd.dentry->d_inode)) {
1400 retval = -ENOMEM;
1401 mnt = add_vfsmnt(&nd, sb->s_root, dev_name);
1402 }
1403 up(&nd.dentry->d_inode->i_zombie);
1404 if (!mnt)
1405 goto fail;
1406 retval = 0;
1407 unlock_out:
1408 up(&mount_sem);
1409 dput_out:
1410 path_release(&nd);
1411 fs_out:
1412 put_filesystem(fstype);
1413 return retval;
1414
1415 fail:
1416 if (list_empty(&sb->s_mounts))
1417 kill_super(sb, 0);
1418 goto unlock_out;
1419 }
1420
1421 asmlinkage long sys_mount(char * dev_name, char * dir_name, char * type,
1422 unsigned long flags, void * data)
1423 {
1424 int retval;
1425 unsigned long data_page;
1426 unsigned long type_page;
1427 unsigned long dev_page;
1428 char *dir_page;
1429
1430 retval = copy_mount_options (type, &type_page);
1431 if (retval < 0)
1432 return retval;
1433
1434 dir_page = getname(dir_name);
1435 retval = PTR_ERR(dir_page);
1436 if (IS_ERR(dir_page))
1437 goto out1;
1438
1439 retval = copy_mount_options (dev_name, &dev_page);
1440 if (retval < 0)
1441 goto out2;
1442
1443 retval = copy_mount_options (data, &data_page);
1444 if (retval < 0)
1445 goto out3;
1446
1447 lock_kernel();
1448 retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
1449 flags, (void*)data_page);
1450 unlock_kernel();
1451 free_page(data_page);
1452
1453 out3:
1454 free_page(dev_page);
1455 out2:
1456 putname(dir_page);
1457 out1:
1458 free_page(type_page);
1459 return retval;
1460 }
1461
/*
 * Mount the root filesystem on ROOT_DEV and make it the root and the
 * working directory of the current task.
 *
 * Order of attempts:
 *  - with CONFIG_ROOT_NFS, and only if ROOT_DEV has UNNAMED_MAJOR, an
 *    NFS root; on failure ROOT_DEV is switched to the first floppy;
 *  - with CONFIG_BLK_DEV_FD, a floppy root gets ejected and the user is
 *    prompted (or the secondary ramdisk image is loaded);
 *  - the block device is opened, an already existing superblock for it
 *    is reused, otherwise every registered filesystem that requires a
 *    device is tried in list order.
 *
 * Does not return an error: every unrecoverable failure ends in panic().
 */
void __init mount_root(void)
{
	struct file_system_type * fs_type;
	struct super_block * sb;
	struct vfsmount *vfsmnt;
	struct block_device *bdev = NULL;	/* stays NULL on the NFS path */
	mode_t mode;
	int retval;
	void *handle;
	char path[64];
	int path_start = -1;	/* < 0: no devfs path known, use "/dev/root" */

#ifdef CONFIG_ROOT_NFS
	void *data;
	/* NFS root is only attempted for an unnamed ROOT_DEV. */
	if (MAJOR(ROOT_DEV) != UNNAMED_MAJOR)
		goto skip_nfs;
	fs_type = get_fs_type("nfs");
	if (!fs_type)
		goto no_nfs;
	ROOT_DEV = get_unnamed_dev();
	if (!ROOT_DEV)
		/*
		 * Your /linuxrc sucks worse than MSExchange - that's the
		 * only way you could run out of anon devices at that point.
		 */
		goto no_anon;
	data = nfs_root_data();
	if (!data)
		goto no_server;
	sb = read_super(ROOT_DEV, NULL, fs_type, root_mountflags, data, 1);
	if (sb)
		/*
		 * We _can_ fail there, but if that will happen we have no
		 * chance anyway (no memory for vfsmnt and we _will_ need it,
		 * no matter which fs we try to mount).
		 */
		goto mount_it;
	/* Unwind in reverse order of acquisition, then fall back to floppy. */
no_server:
	put_unnamed_dev(ROOT_DEV);
no_anon:
	put_filesystem(fs_type);
no_nfs:
	printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
	ROOT_DEV = MKDEV(FLOPPY_MAJOR, 0);
skip_nfs:
#endif

#ifdef CONFIG_BLK_DEV_FD
	if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
#ifdef CONFIG_BLK_DEV_RAM
		extern int rd_doload;
		extern void rd_load_secondary(void);
#endif
		floppy_eject();
#ifndef CONFIG_BLK_DEV_RAM
		printk(KERN_NOTICE "(Warning, this kernel has no ramdisk support)\n");
#else
		/* rd_doload is 2 for a dual initrd/ramload setup */
		if(rd_doload==2)
			rd_load_secondary();
		else
#endif
		/*
		 * With ramdisk support this block is the "else" branch of
		 * the rd_doload test above; without it, it always runs.
		 */
		{
			printk(KERN_NOTICE "VFS: Insert root floppy and press ENTER\n");
			wait_for_keypress();
		}
	}
#endif

	devfs_make_root (root_device_name);
	handle = devfs_find_handle (NULL, ROOT_DEVICE_NAME,
				    MAJOR (ROOT_DEV), MINOR (ROOT_DEV),
				    DEVFS_SPECIAL_BLK, 1);
	if (handle) /* Sigh: bd*() functions only paper over the cracks */
	{
		unsigned major, minor;

		/* Let the devfs entry dictate the device number of the root. */
		devfs_get_maj_min (handle, &major, &minor);
		ROOT_DEV = MKDEV (major, minor);
	}

	/*
	 * Probably pure paranoia, but I'm less than happy about delving into
	 * devfs crap and checking it right now. Later.
	 */
	if (!ROOT_DEV)
		panic("I have no root and I want to scream");

	bdev = bdget(kdev_t_to_nr(ROOT_DEV));
	if (!bdev)
		panic(__FUNCTION__ ": unable to allocate root device");
	bdev->bd_op = devfs_get_ops (handle);
	/*
	 * The first 5 bytes of path[] are kept free so that "/dev/" can be
	 * put in front of the generated name further down.
	 * NOTE(review): path_start is presumably the offset of the name
	 * inside path + 5, negative on failure - confirm against devfs.
	 */
	path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
	/* Open read-write unless the root is to be mounted read-only. */
	mode = FMODE_READ;
	if (!(root_mountflags & MS_RDONLY))
		mode |= FMODE_WRITE;
	retval = blkdev_get(bdev, mode, 0, BDEV_FS);
	if (retval == -EROFS) {
		/* Device refuses writes: force a read-only root and retry. */
		root_mountflags |= MS_RDONLY;
		retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
	}
	if (retval) {
		/*
		 * Allow the user to distinguish between failed open
		 * and bad superblock on root device.
		 */
		printk ("VFS: Cannot open root device \"%s\" or %s\n",
			root_device_name, kdevname (ROOT_DEV));
		printk ("Please append a correct \"root=\" boot option\n");
		panic("VFS: Unable to mount root fs on %s",
			kdevname(ROOT_DEV));
	}

	check_disk_change(ROOT_DEV);
	/* Reuse a superblock that already exists for this device. */
	sb = get_super(ROOT_DEV);
	if (sb) {
		fs_type = sb->s_type;
		goto mount_it;
	}

	/*
	 * Try every registered filesystem that needs a block device.  The
	 * list lock is dropped around read_super() and retaken before the
	 * walk continues; the reference taken with try_inc_mod_count() is
	 * given back with put_filesystem() when the attempt fails.
	 */
	read_lock(&file_systems_lock);
	for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
		if (!(fs_type->fs_flags & FS_REQUIRES_DEV))
			continue;
		if (!try_inc_mod_count(fs_type->owner))
			continue;
		read_unlock(&file_systems_lock);
		sb = read_super(ROOT_DEV,bdev,fs_type,root_mountflags,NULL,1);
		if (sb)
			goto mount_it;
		read_lock(&file_systems_lock);
		put_filesystem(fs_type);
	}
	read_unlock(&file_systems_lock);
	panic("VFS: Unable to mount root fs on %s", kdevname(ROOT_DEV));

mount_it:
	printk ("VFS: Mounted root (%s filesystem)%s.\n",
		fs_type->name,
		(sb->s_flags & MS_RDONLY) ? " readonly" : "");
	if (path_start >= 0) {
		/*
		 * Create a "root" symlink in devfs to the generated name and
		 * mount under that name prefixed with "/dev/".
		 */
		devfs_mk_symlink (NULL, "root", DEVFS_FL_DEFAULT,
				  path + 5 + path_start, NULL, NULL);
		memcpy (path + path_start, "/dev/", 5);
		vfsmnt = add_vfsmnt(NULL, sb->s_root, path + path_start);
	}
	else
		vfsmnt = add_vfsmnt(NULL, sb->s_root, "/dev/root");
	/* FIXME: if something will try to umount us right now... */
	if (vfsmnt) {
		set_fs_root(current->fs, vfsmnt, sb->s_root);
		set_fs_pwd(current->fs, vfsmnt, sb->s_root);
		if (bdev)
			bdput(bdev); /* sb holds a reference */
		return;
	}
	panic("VFS: add_vfsmnt failed for root fs");
}
1620
1621
1622 static void chroot_fs_refs(struct dentry *old_root,
1623 struct vfsmount *old_rootmnt,
1624 struct dentry *new_root,
1625 struct vfsmount *new_rootmnt)
1626 {
1627 struct task_struct *p;
1628 struct fs_struct *fs;
1629
1630 read_lock(&tasklist_lock);
1631 for_each_task(p) {
1632 task_lock(p);
1633 fs = p->fs;
1634 if (fs) {
1635 atomic_inc(&fs->count);
1636 task_unlock(p);
1637 if (fs->root==old_root && fs->rootmnt==old_rootmnt)
1638 set_fs_root(fs, new_rootmnt, new_root);
1639 if (fs->pwd==old_root && fs->pwdmnt==old_rootmnt)
1640 set_fs_pwd(fs, new_rootmnt, new_root);
1641 put_fs_struct(fs);
1642 } else
1643 task_unlock(p);
1644 }
1645 read_unlock(&tasklist_lock);
1646 }
1647
1648 /*
1649 * Moves the current root to put_root, and sets root/cwd of all processes
1650 * which had them on the old root to new_root.
1651 *
1652 * Note:
1653 * - we don't move root/cwd if they are not at the root (reason: if something
1654 * cared enough to change them, it's probably wrong to force them elsewhere)
1655 * - it's okay to pick a root that isn't the root of a file system, e.g.
1656 * /nfs/my_root where /nfs is the mount point. Better avoid creating
1657 * unreachable mount points this way, though.
1658 */
1659
1660 asmlinkage long sys_pivot_root(const char *new_root, const char *put_old)
1661 {
1662 struct dentry *root;
1663 struct vfsmount *root_mnt;
1664 struct vfsmount *tmp;
1665 struct nameidata new_nd, old_nd;
1666 char *name;
1667 int error;
1668
1669 if (!capable(CAP_SYS_ADMIN))
1670 return -EPERM;
1671
1672 lock_kernel();
1673
1674 name = getname(new_root);
1675 error = PTR_ERR(name);
1676 if (IS_ERR(name))
1677 goto out0;
1678 error = 0;
1679 if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd))
1680 error = path_walk(name, &new_nd);
1681 putname(name);
1682 if (error)
1683 goto out0;
1684
1685 name = getname(put_old);
1686 error = PTR_ERR(name);
1687 if (IS_ERR(name))
1688 goto out0;
1689 error = 0;
1690 if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd))
1691 error = path_walk(name, &old_nd);
1692 putname(name);
1693 if (error)
1694 goto out1;
1695
1696 read_lock(¤t->fs->lock);
1697 root_mnt = mntget(current->fs->rootmnt);
1698 root = dget(current->fs->root);
1699 read_unlock(¤t->fs->lock);
1700 down(&mount_sem);
1701 down(&old_nd.dentry->d_inode->i_zombie);
1702 error = -ENOENT;
1703 if (IS_DEADDIR(new_nd.dentry->d_inode))
1704 goto out2;
1705 if (d_unhashed(new_nd.dentry) && !IS_ROOT(new_nd.dentry))
1706 goto out2;
1707 if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry))
1708 goto out2;
1709 error = -EBUSY;
1710 if (new_nd.mnt == root_mnt || old_nd.mnt == root_mnt)
1711 goto out2; /* loop */
1712 error = -EINVAL;
1713 tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
1714 spin_lock(&dcache_lock);
1715 if (tmp != new_nd.mnt) {
1716 for (;;) {
1717 if (tmp->mnt_parent == tmp)
1718 goto out3;
1719 if (tmp->mnt_parent == new_nd.mnt)
1720 break;
1721 tmp = tmp->mnt_parent;
1722 }
1723 if (!is_subdir(tmp->mnt_mountpoint, new_nd.dentry))
1724 goto out3;
1725 } else if (!is_subdir(old_nd.dentry, new_nd.dentry))
1726 goto out3;
1727 spin_unlock(&dcache_lock);
1728
1729 move_vfsmnt(new_nd.mnt, new_nd.dentry, NULL, NULL);
1730 move_vfsmnt(root_mnt, old_nd.dentry, old_nd.mnt, NULL);
1731 chroot_fs_refs(root,root_mnt,new_nd.dentry,new_nd.mnt);
1732 error = 0;
1733 out2:
1734 up(&old_nd.dentry->d_inode->i_zombie);
1735 up(&mount_sem);
1736 dput(root);
1737 mntput(root_mnt);
1738 path_release(&old_nd);
1739 out1:
1740 path_release(&new_nd);
1741 out0:
1742 unlock_kernel();
1743 return error;
1744 out3:
1745 spin_unlock(&dcache_lock);
1746 goto out2;
1747 }
1748
1749
1750 #ifdef CONFIG_BLK_DEV_INITRD
1751
1752 int __init change_root(kdev_t new_root_dev,const char *put_old)
1753 {
1754 struct vfsmount *old_rootmnt;
1755 struct nameidata devfs_nd, nd;
1756 int error = 0;
1757
1758 read_lock(¤t->fs->lock);
1759 old_rootmnt = mntget(current->fs->rootmnt);
1760 read_unlock(¤t->fs->lock);
1761 /* First unmount devfs if mounted */
1762 if (path_init("/dev", LOOKUP_FOLLOW|LOOKUP_POSITIVE, &devfs_nd))
1763 error = path_walk("/dev", &devfs_nd);
1764 if (!error) {
1765 if (devfs_nd.mnt->mnt_sb->s_magic == DEVFS_SUPER_MAGIC &&
1766 devfs_nd.dentry == devfs_nd.mnt->mnt_root) {
1767 dput(devfs_nd.dentry);
1768 down(&mount_sem);
1769 /* puts devfs_nd.mnt */
1770 do_umount(devfs_nd.mnt, 0, 0);
1771 up(&mount_sem);
1772 } else
1773 path_release(&devfs_nd);
1774 }
1775 ROOT_DEV = new_root_dev;
1776 mount_root();
1777 #if 1
1778 shrink_dcache();
1779 printk("change_root: old root has d_count=%d\n",
1780 atomic_read(&old_rootmnt->mnt_root->d_count));
1781 #endif
1782 mount_devfs_fs ();
1783 /*
1784 * Get the new mount directory
1785 */
1786 error = 0;
1787 if (path_init(put_old, LOOKUP_FOLLOW|LOOKUP_POSITIVE|LOOKUP_DIRECTORY, &nd))
1788 error = path_walk(put_old, &nd);
1789 if (error) {
1790 int blivet;
1791
1792 printk(KERN_NOTICE "Trying to unmount old root ... ");
1793 blivet = do_umount(old_rootmnt, 1, 0);
1794 if (!blivet) {
1795 printk("okay\n");
1796 return 0;
1797 }
1798 printk(KERN_ERR "error %d\n", blivet);
1799 return error;
1800 }
1801 /* FIXME: we should hold i_zombie on nd.dentry */
1802 move_vfsmnt(old_rootmnt, nd.dentry, nd.mnt, "/dev/root.old");
1803 mntput(old_rootmnt);
1804 path_release(&nd);
1805 return 0;
1806 }
1807
1808 #endif
1809
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.