root/libdb/bt_open.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. __bt_open
  2. nroot
  3. tmp
  4. byteorder
  5. __bt_fd

   1 /*-
   2  * Copyright (c) 1990, 1993, 1994
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * Mike Olson.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  */
  32 
  33 #if defined(LIBC_SCCS) && !defined(lint)
  34 static char sccsid[] = "@(#)bt_open.c   8.10 (Berkeley) 8/17/94";
  35 #endif /* LIBC_SCCS and not lint */
  36 
  37 /**
  38  * @file
  39  * Implementation of btree access method for 4.4BSD.
  40  *
  41  * The design here was originally based on that of the btree access method
  42  * used in the Postgres database system at UC Berkeley.  This implementation
  43  * is wholly independent of the Postgres code.
  44  */
  45 
  46 #ifdef HAVE_CONFIG_H
  47 #include <config.h>
  48 #endif
  49 #include <sys/stat.h>
  50 
  51 #include <errno.h>
  52 #include <fcntl.h>
  53 #ifdef HAVE_LIMITS_H
  54 #include <limits.h>
  55 #endif
  56 #include <signal.h>
  57 #include <stdio.h>
  58 #ifdef STDC_HEADERS
  59 #include <stdlib.h>
  60 #endif
  61 #ifdef HAVE_STRING_H
  62 #include <string.h>
  63 #else
  64 #include <strings.h>
  65 #endif
  66 #ifdef HAVE_UNISTD_H
  67 #include <unistd.h>
  68 #endif
  69 
  70 #if (defined(_WIN32) && !defined(__CYGWIN__))
  71 #include <share.h>
  72 #define mkstemp(p) open(_mktemp(p), _O_CREAT | _O_RDWR | _O_BINARY | _O_TEMPORARY, _S_IWRITE | _SH_DENYRW)
  73 #endif
  74 
  75 #include "db.h"
  76 #include "btree.h"
  77 
  78 #ifdef DEBUG
  79 #undef  MINPSIZE
  80 #define MINPSIZE        128
  81 #endif
  82 
  83 static int byteorder(void);
  84 static int nroot(BTREE *);
  85 static int tmp(void);
  86 
  87 /**
  88  * __BT_OPEN -- Open a btree.
  89  *
  90  * Creates and fills a #DB struct, and calls the routine that actually
  91  * opens the btree.
  92  *
  93  *      @param fname    filename (@CODE{NULL} for in-memory trees)
  94  *      @param flags    open flag bits
  95  *      @param mode     open permission bits
  96  *      @param openinfo #BTREEINFO pointer
  97  *      @param dflags
  98  *
  99  * @return @CODE{NULL} on failure, pointer to #DB on success.
 100  *
 101  */
 102 DB *
 103 __bt_open(fname, flags, mode, openinfo, dflags)
 104         const char *fname;
 105         int flags, mode, dflags;
 106         const BTREEINFO *openinfo;
 107 {
 108         struct stat sb;
 109         BTMETA m;
 110         BTREE *t;
 111         BTREEINFO b;
 112         DB *dbp;
 113         pgno_t ncache;
 114         ssize_t nr;
 115         int machine_lorder;
 116 
 117         t = NULL;
 118 
 119         /*
 120          * Intention is to make sure all of the user's selections are okay
 121          * here and then use them without checking.  Can't be complete, since
 122          * we don't know the right page size, lorder or flags until the backing
 123          * file is opened.  Also, the file's page size can cause the cachesize
 124          * to change.
 125          */
 126         machine_lorder = byteorder();
 127         if (openinfo) {
 128                 b = *openinfo;
 129 
 130                 /* Flags: R_DUP. */
 131                 if (b.flags & ~(R_DUP))
 132                         goto einval;
 133 
 134                 /*
 135                  * Page size must be indx_t aligned and >= MINPSIZE.  Default
 136                  * page size is set farther on, based on the underlying file
 137                  * transfer size.
 138                  */
 139                 if (b.psize &&
 140                     (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
 141                     b.psize & (sizeof(indx_t) - 1)))
 142                         goto einval;
 143 
 144                 /* Minimum number of keys per page; absolute minimum is 2. */
 145                 if (b.minkeypage) {
 146                         if (b.minkeypage < 2)
 147                                 goto einval;
 148                 } else
 149                         b.minkeypage = DEFMINKEYPAGE;
 150 
 151                 /* If no comparison, use default comparison and prefix. */
 152                 if (b.compare == NULL) {
 153                         b.compare = __bt_defcmp;
 154                         if (b.prefix == NULL)
 155                                 b.prefix = __bt_defpfx;
 156                 }
 157 
 158                 if (b.lorder == 0)
 159                         b.lorder = machine_lorder;
 160         } else {
 161                 b.compare = __bt_defcmp;
 162                 b.cachesize = 0;
 163                 b.flags = 0;
 164                 b.lorder = machine_lorder;
 165                 b.minkeypage = DEFMINKEYPAGE;
 166                 b.prefix = __bt_defpfx;
 167                 b.psize = 0;
 168         }
 169 
 170         /* Check for the ubiquitous PDP-11. */
 171         if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
 172                 goto einval;
 173 
 174         /* Allocate and initialize DB and BTREE structures. */
 175         if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
 176                 goto err;
 177         memset(t, 0, sizeof(BTREE));
 178         t->bt_fd = -1;                  /* Don't close unopened fd on error. */
 179         t->bt_lorder = b.lorder;
 180         t->bt_order = NOT;
 181         t->bt_cmp = b.compare;
 182         t->bt_pfx = b.prefix;
 183         t->bt_rfd = -1;
 184 
 185         if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
 186                 goto err;
 187         memset(t->bt_dbp, 0, sizeof(DB));
 188         if (t->bt_lorder != machine_lorder)
 189                 F_SET(t, B_NEEDSWAP);
 190 
 191         dbp->type = DB_BTREE;
 192         dbp->internal = t;
 193         dbp->close = __bt_close;
 194         dbp->del = __bt_delete;
 195         dbp->fd = __bt_fd;
 196         dbp->get = __bt_get;
 197         dbp->put = __bt_put;
 198         dbp->seq = __bt_seq;
 199         dbp->sync = __bt_sync;
 200 
 201         /*
 202          * If no file name was supplied, this is an in-memory btree and we
 203          * open a backing temporary file.  Otherwise, it's a disk-based tree.
 204          */
 205         if (fname) {
 206                 switch (flags & O_ACCMODE) {
 207                 case O_RDONLY:
 208                         F_SET(t, B_RDONLY);
 209                         break;
 210                 case O_RDWR:
 211                         break;
 212                 case O_WRONLY:
 213                 default:
 214                         goto einval;
 215                 }
 216                 
 217                 if ((t->bt_fd = open(fname, flags, mode)) < 0)
 218                         goto err;
 219 
 220         } else {
 221                 if ((flags & O_ACCMODE) != O_RDWR)
 222                         goto einval;
 223                 if ((t->bt_fd = tmp()) == -1)
 224                         goto err;
 225                 F_SET(t, B_INMEM);
 226         }
 227 
 228 #if !defined(_WIN32) && !defined(__DJGPP__)
 229         if (fcntl(t->bt_fd, F_SETFD, 1) == -1)
 230                 goto err;
 231 #endif
 232 
 233         if (fstat(t->bt_fd, &sb))
 234                 goto err;
 235         if (sb.st_size) {
 236                 if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
 237                         goto err;
 238                 if (nr != sizeof(BTMETA))
 239                         goto eftype;
 240 
 241                 /*
 242                  * Read in the meta-data.  This can change the notion of what
 243                  * the lorder, page size and flags are, and, when the page size
 244                  * changes, the cachesize value can change too.  If the user
 245                  * specified the wrong byte order for an existing database, we
 246                  * don't bother to return an error, we just clear the NEEDSWAP
 247                  * bit.
 248                  */
 249                 if (m.magic == BTREEMAGIC)
 250                         F_CLR(t, B_NEEDSWAP);
 251                 else {
 252                         F_SET(t, B_NEEDSWAP);
 253                         M_32_SWAP(m.magic);
 254                         M_32_SWAP(m.version);
 255                         M_32_SWAP(m.psize);
 256                         M_32_SWAP(m.free);
 257                         M_32_SWAP(m.nrecs);
 258                         M_32_SWAP(m.flags);
 259                 }
 260                 if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
 261                         goto eftype;
 262                 if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
 263                     m.psize & (sizeof(indx_t) - 1))
 264                         goto eftype;
 265                 if (m.flags & ~SAVEMETA)
 266                         goto eftype;
 267                 b.psize = m.psize;
 268                 F_SET(t, m.flags);
 269                 t->bt_free = m.free;
 270                 t->bt_nrecs = m.nrecs;
 271         } else {
 272                 /*
 273                  * Set the page size to the best value for I/O to this file.
 274                  * Don't overflow the page offset type.
 275                  */
 276                 if (b.psize == 0) {
 277 #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
 278                         b.psize = sb.st_blksize;
 279                         if (b.psize < MINPSIZE)
 280                                 b.psize = MINPSIZE;
 281                         if (b.psize > MAX_PAGE_OFFSET + 1)
 282                                 b.psize = MAX_PAGE_OFFSET + 1;
 283 #else
 284                         b.psize = MINPSIZE;
 285 #endif
 286                 }
 287 
 288                 /* Set flag if duplicates permitted. */
 289                 if (!(b.flags & R_DUP))
 290                         F_SET(t, B_NODUPS);
 291 
 292                 t->bt_free = P_INVALID;
 293                 t->bt_nrecs = 0;
 294                 F_SET(t, B_METADIRTY);
 295         }
 296 
 297         t->bt_psize = b.psize;
 298 
 299         /* Set the cache size; must be a multiple of the page size. */
 300         if (b.cachesize && b.cachesize & (b.psize - 1))
 301                 b.cachesize += (~b.cachesize & (b.psize - 1)) + 1;
 302         if (b.cachesize < b.psize * MINCACHE)
 303                 b.cachesize = b.psize * MINCACHE;
 304 
 305         /* Calculate number of pages to cache. */
 306         ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
 307 
 308         /*
 309          * The btree data structure requires that at least two keys can fit on
 310          * a page, but other than that there's no fixed requirement.  The user
 311          * specified a minimum number per page, and we translated that into the
 312          * number of bytes a key/data pair can use before being placed on an
 313          * overflow page.  This calculation includes the page header, the size
 314          * of the index referencing the leaf item and the size of the leaf item
 315          * structure.  Also, don't let the user specify a minkeypage such that
 316          * a key/data pair won't fit even if both key and data are on overflow
 317          * pages.
 318          */
 319         t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
 320             (sizeof(indx_t) + NBLEAFDBT(0, 0));
 321         if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
 322                 t->bt_ovflsize =
 323                     NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
 324 
 325         /* Initialize the buffer pool. */
 326         if ((t->bt_mp =
 327             mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
 328                 goto err;
 329         if (!F_ISSET(t, B_INMEM))
 330                 mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
 331 
 332         /* Create a root page if new tree. */
 333         if (nroot(t) == RET_ERROR)
 334                 goto err;
 335 
 336         /* Global flags. */
 337         if (dflags & DB_LOCK)
 338                 F_SET(t, B_DB_LOCK);
 339         if (dflags & DB_SHMEM)
 340                 F_SET(t, B_DB_SHMEM);
 341         if (dflags & DB_TXN)
 342                 F_SET(t, B_DB_TXN);
 343 
 344         return (dbp);
 345 
 346 einval: errno = EINVAL;
 347         goto err;
 348 
 349 eftype: errno = EFTYPE;
 350         goto err;
 351 
 352 err:    if (t) {
 353                 if (t->bt_dbp)
 354                         free(t->bt_dbp);
 355                 if (t->bt_fd != -1)
 356                         (void)close(t->bt_fd);
 357                 free(t);
 358         }
 359         return (NULL);
 360 }
 361 
 362 /**
 363  * NROOT -- Create the root of a new tree.
 364  *
 365  *      @param t        tree
 366  *
 367  * @return #RET_ERROR, #RET_SUCCESS
 368  */
 369 static int
 370 nroot(t)
 371         BTREE *t;
 372 {
 373         PAGE *meta, *root;
 374         pgno_t npg;
 375 
 376         if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
 377                 mpool_put(t->bt_mp, meta, 0);
 378                 return (RET_SUCCESS);
 379         }
 380         if (errno != EINVAL)            /* It's OK to not exist. */
 381                 return (RET_ERROR);
 382         errno = 0;
 383 
 384         if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
 385                 return (RET_ERROR);
 386 
 387         if ((root = mpool_new(t->bt_mp, &npg)) == NULL)
 388                 return (RET_ERROR);
 389 
 390         if (npg != P_ROOT)
 391                 return (RET_ERROR);
 392         root->pgno = npg;
 393         root->prevpg = root->nextpg = P_INVALID;
 394         root->lower = BTDATAOFF;
 395         root->upper = t->bt_psize;
 396         root->flags = P_BLEAF;
 397         memset(meta, 0, t->bt_psize);
 398         mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
 399         mpool_put(t->bt_mp, root, MPOOL_DIRTY);
 400         return (RET_SUCCESS);
 401 }
 402 
 403 static int
 404 tmp(void)
 405 {
 406         sigset_t set, oset;
 407         int fd;
 408         char *envtmp;
 409         char path[1024];
 410 
 411         envtmp = getenv("TMPDIR");
 412 #ifdef _WIN32
 413         if (envtmp == NULL)
 414                 envtmp = getenv("TMP");
 415 #endif
 416         if (envtmp && strlen(envtmp) + strlen("/bt.XXXXXX") >= sizeof(path))
 417                 return -1;
 418         (void)snprintf(path, sizeof(path),
 419                                 "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp");
 420 
 421 #ifndef _WIN32
 422         (void)sigfillset(&set);
 423         (void)sigprocmask(SIG_BLOCK, &set, &oset);
 424 #endif
 425         if ((fd = mkstemp(path)) != -1)
 426                 (void)unlink(path);
 427 #ifndef _WIN32
 428         (void)sigprocmask(SIG_SETMASK, &oset, NULL);
 429 #endif
 430         return(fd);
 431 }
 432 
 433 static int
 434 byteorder(void)
 435 {
 436         u_int32_t x;
 437         u_char *p;
 438 
 439         x = 0x01020304;
 440         p = (u_char *)&x;
 441         switch (*p) {
 442         case 1:
 443                 return (BIG_ENDIAN);
 444         case 4:
 445                 return (LITTLE_ENDIAN);
 446         default:
 447                 return (0);
 448         }
 449 }
 450 
 451 int
 452 __bt_fd(dbp)
 453         const DB *dbp;
 454 {
 455         BTREE *t;
 456 
 457         t = dbp->internal;
 458 
 459         /* Toss any page pinned across calls. */
 460         if (t->bt_pinned != NULL) {
 461                 mpool_put(t->bt_mp, t->bt_pinned, 0);
 462                 t->bt_pinned = NULL;
 463         }
 464 
 465         /* In-memory database can't have a file descriptor. */
 466         if (F_ISSET(t, B_INMEM)) {
 467                 errno = ENOENT;
 468                 return (-1);
 469         }
 470         return (t->bt_fd);
 471 }

/* [previous][next][first][last][top][bottom][index][help] */