/* */
This source file includes following definitions.
- __bt_open
- nroot
- tmp
- byteorder
- __bt_fd
1 /*-
2 * Copyright (c) 1990, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33 #if defined(LIBC_SCCS) && !defined(lint)
34 static char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94";
35 #endif /* LIBC_SCCS and not lint */
36
37 /**
38 * @file
39 * Implementation of btree access method for 4.4BSD.
40 *
41 * The design here was originally based on that of the btree access method
42 * used in the Postgres database system at UC Berkeley. This implementation
43 * is wholly independent of the Postgres code.
44 */
45
46 #ifdef HAVE_CONFIG_H
47 #include <config.h>
48 #endif
49 #include <sys/stat.h>
50
51 #include <errno.h>
52 #include <fcntl.h>
53 #ifdef HAVE_LIMITS_H
54 #include <limits.h>
55 #endif
56 #include <signal.h>
57 #include <stdio.h>
58 #ifdef STDC_HEADERS
59 #include <stdlib.h>
60 #endif
61 #ifdef HAVE_STRING_H
62 #include <string.h>
63 #else
64 #include <strings.h>
65 #endif
66 #ifdef HAVE_UNISTD_H
67 #include <unistd.h>
68 #endif
69
70 #if (defined(_WIN32) && !defined(__CYGWIN__))
71 #include <share.h>
72 #define mkstemp(p) open(_mktemp(p), _O_CREAT | _O_RDWR | _O_BINARY | _O_TEMPORARY, _S_IWRITE | _SH_DENYRW)
73 #endif
74
75 #include "db.h"
76 #include "btree.h"
77
78 #ifdef DEBUG
79 #undef MINPSIZE
80 #define MINPSIZE 128
81 #endif
82
83 static int byteorder(void);
84 static int nroot(BTREE *);
85 static int tmp(void);
86
87 /**
88 * __BT_OPEN -- Open a btree.
89 *
90 * Creates and fills a #DB struct, and calls the routine that actually
91 * opens the btree.
92 *
93 * @param fname filename (@CODE{NULL} for in-memory trees)
94 * @param flags open flag bits
95 * @param mode open permission bits
96 * @param openinfo #BTREEINFO pointer
97 * @param dflags
98 *
99 * @return @CODE{NULL} on failure, pointer to #DB on success.
100 *
101 */
102 DB *
103 __bt_open(fname, flags, mode, openinfo, dflags)
104 const char *fname;
105 int flags, mode, dflags;
106 const BTREEINFO *openinfo;
107 {
108 struct stat sb;
109 BTMETA m;
110 BTREE *t;
111 BTREEINFO b;
112 DB *dbp;
113 pgno_t ncache;
114 ssize_t nr;
115 int machine_lorder;
116
117 t = NULL;
118
119 /*
120 * Intention is to make sure all of the user's selections are okay
121 * here and then use them without checking. Can't be complete, since
122 * we don't know the right page size, lorder or flags until the backing
123 * file is opened. Also, the file's page size can cause the cachesize
124 * to change.
125 */
126 machine_lorder = byteorder();
127 if (openinfo) {
128 b = *openinfo;
129
130 /* Flags: R_DUP. */
131 if (b.flags & ~(R_DUP))
132 goto einval;
133
134 /*
135 * Page size must be indx_t aligned and >= MINPSIZE. Default
136 * page size is set farther on, based on the underlying file
137 * transfer size.
138 */
139 if (b.psize &&
140 (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
141 b.psize & (sizeof(indx_t) - 1)))
142 goto einval;
143
144 /* Minimum number of keys per page; absolute minimum is 2. */
145 if (b.minkeypage) {
146 if (b.minkeypage < 2)
147 goto einval;
148 } else
149 b.minkeypage = DEFMINKEYPAGE;
150
151 /* If no comparison, use default comparison and prefix. */
152 if (b.compare == NULL) {
153 b.compare = __bt_defcmp;
154 if (b.prefix == NULL)
155 b.prefix = __bt_defpfx;
156 }
157
158 if (b.lorder == 0)
159 b.lorder = machine_lorder;
160 } else {
161 b.compare = __bt_defcmp;
162 b.cachesize = 0;
163 b.flags = 0;
164 b.lorder = machine_lorder;
165 b.minkeypage = DEFMINKEYPAGE;
166 b.prefix = __bt_defpfx;
167 b.psize = 0;
168 }
169
170 /* Check for the ubiquitous PDP-11. */
171 if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
172 goto einval;
173
174 /* Allocate and initialize DB and BTREE structures. */
175 if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
176 goto err;
177 memset(t, 0, sizeof(BTREE));
178 t->bt_fd = -1; /* Don't close unopened fd on error. */
179 t->bt_lorder = b.lorder;
180 t->bt_order = NOT;
181 t->bt_cmp = b.compare;
182 t->bt_pfx = b.prefix;
183 t->bt_rfd = -1;
184
185 if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
186 goto err;
187 memset(t->bt_dbp, 0, sizeof(DB));
188 if (t->bt_lorder != machine_lorder)
189 F_SET(t, B_NEEDSWAP);
190
191 dbp->type = DB_BTREE;
192 dbp->internal = t;
193 dbp->close = __bt_close;
194 dbp->del = __bt_delete;
195 dbp->fd = __bt_fd;
196 dbp->get = __bt_get;
197 dbp->put = __bt_put;
198 dbp->seq = __bt_seq;
199 dbp->sync = __bt_sync;
200
201 /*
202 * If no file name was supplied, this is an in-memory btree and we
203 * open a backing temporary file. Otherwise, it's a disk-based tree.
204 */
205 if (fname) {
206 switch (flags & O_ACCMODE) {
207 case O_RDONLY:
208 F_SET(t, B_RDONLY);
209 break;
210 case O_RDWR:
211 break;
212 case O_WRONLY:
213 default:
214 goto einval;
215 }
216
217 if ((t->bt_fd = open(fname, flags, mode)) < 0)
218 goto err;
219
220 } else {
221 if ((flags & O_ACCMODE) != O_RDWR)
222 goto einval;
223 if ((t->bt_fd = tmp()) == -1)
224 goto err;
225 F_SET(t, B_INMEM);
226 }
227
228 #if !defined(_WIN32) && !defined(__DJGPP__)
229 if (fcntl(t->bt_fd, F_SETFD, 1) == -1)
230 goto err;
231 #endif
232
233 if (fstat(t->bt_fd, &sb))
234 goto err;
235 if (sb.st_size) {
236 if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
237 goto err;
238 if (nr != sizeof(BTMETA))
239 goto eftype;
240
241 /*
242 * Read in the meta-data. This can change the notion of what
243 * the lorder, page size and flags are, and, when the page size
244 * changes, the cachesize value can change too. If the user
245 * specified the wrong byte order for an existing database, we
246 * don't bother to return an error, we just clear the NEEDSWAP
247 * bit.
248 */
249 if (m.magic == BTREEMAGIC)
250 F_CLR(t, B_NEEDSWAP);
251 else {
252 F_SET(t, B_NEEDSWAP);
253 M_32_SWAP(m.magic);
254 M_32_SWAP(m.version);
255 M_32_SWAP(m.psize);
256 M_32_SWAP(m.free);
257 M_32_SWAP(m.nrecs);
258 M_32_SWAP(m.flags);
259 }
260 if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
261 goto eftype;
262 if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
263 m.psize & (sizeof(indx_t) - 1))
264 goto eftype;
265 if (m.flags & ~SAVEMETA)
266 goto eftype;
267 b.psize = m.psize;
268 F_SET(t, m.flags);
269 t->bt_free = m.free;
270 t->bt_nrecs = m.nrecs;
271 } else {
272 /*
273 * Set the page size to the best value for I/O to this file.
274 * Don't overflow the page offset type.
275 */
276 if (b.psize == 0) {
277 #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
278 b.psize = sb.st_blksize;
279 if (b.psize < MINPSIZE)
280 b.psize = MINPSIZE;
281 if (b.psize > MAX_PAGE_OFFSET + 1)
282 b.psize = MAX_PAGE_OFFSET + 1;
283 #else
284 b.psize = MINPSIZE;
285 #endif
286 }
287
288 /* Set flag if duplicates permitted. */
289 if (!(b.flags & R_DUP))
290 F_SET(t, B_NODUPS);
291
292 t->bt_free = P_INVALID;
293 t->bt_nrecs = 0;
294 F_SET(t, B_METADIRTY);
295 }
296
297 t->bt_psize = b.psize;
298
299 /* Set the cache size; must be a multiple of the page size. */
300 if (b.cachesize && b.cachesize & (b.psize - 1))
301 b.cachesize += (~b.cachesize & (b.psize - 1)) + 1;
302 if (b.cachesize < b.psize * MINCACHE)
303 b.cachesize = b.psize * MINCACHE;
304
305 /* Calculate number of pages to cache. */
306 ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
307
308 /*
309 * The btree data structure requires that at least two keys can fit on
310 * a page, but other than that there's no fixed requirement. The user
311 * specified a minimum number per page, and we translated that into the
312 * number of bytes a key/data pair can use before being placed on an
313 * overflow page. This calculation includes the page header, the size
314 * of the index referencing the leaf item and the size of the leaf item
315 * structure. Also, don't let the user specify a minkeypage such that
316 * a key/data pair won't fit even if both key and data are on overflow
317 * pages.
318 */
319 t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
320 (sizeof(indx_t) + NBLEAFDBT(0, 0));
321 if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
322 t->bt_ovflsize =
323 NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
324
325 /* Initialize the buffer pool. */
326 if ((t->bt_mp =
327 mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
328 goto err;
329 if (!F_ISSET(t, B_INMEM))
330 mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
331
332 /* Create a root page if new tree. */
333 if (nroot(t) == RET_ERROR)
334 goto err;
335
336 /* Global flags. */
337 if (dflags & DB_LOCK)
338 F_SET(t, B_DB_LOCK);
339 if (dflags & DB_SHMEM)
340 F_SET(t, B_DB_SHMEM);
341 if (dflags & DB_TXN)
342 F_SET(t, B_DB_TXN);
343
344 return (dbp);
345
346 einval: errno = EINVAL;
347 goto err;
348
349 eftype: errno = EFTYPE;
350 goto err;
351
352 err: if (t) {
353 if (t->bt_dbp)
354 free(t->bt_dbp);
355 if (t->bt_fd != -1)
356 (void)close(t->bt_fd);
357 free(t);
358 }
359 return (NULL);
360 }
361
362 /**
363 * NROOT -- Create the root of a new tree.
364 *
365 * @param t tree
366 *
367 * @return #RET_ERROR, #RET_SUCCESS
368 */
369 static int
370 nroot(t)
371 BTREE *t;
372 {
373 PAGE *meta, *root;
374 pgno_t npg;
375
376 if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
377 mpool_put(t->bt_mp, meta, 0);
378 return (RET_SUCCESS);
379 }
380 if (errno != EINVAL) /* It's OK to not exist. */
381 return (RET_ERROR);
382 errno = 0;
383
384 if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
385 return (RET_ERROR);
386
387 if ((root = mpool_new(t->bt_mp, &npg)) == NULL)
388 return (RET_ERROR);
389
390 if (npg != P_ROOT)
391 return (RET_ERROR);
392 root->pgno = npg;
393 root->prevpg = root->nextpg = P_INVALID;
394 root->lower = BTDATAOFF;
395 root->upper = t->bt_psize;
396 root->flags = P_BLEAF;
397 memset(meta, 0, t->bt_psize);
398 mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
399 mpool_put(t->bt_mp, root, MPOOL_DIRTY);
400 return (RET_SUCCESS);
401 }
402
403 static int
404 tmp(void)
405 {
406 sigset_t set, oset;
407 int fd;
408 char *envtmp;
409 char path[1024];
410
411 envtmp = getenv("TMPDIR");
412 #ifdef _WIN32
413 if (envtmp == NULL)
414 envtmp = getenv("TMP");
415 #endif
416 if (envtmp && strlen(envtmp) + strlen("/bt.XXXXXX") >= sizeof(path))
417 return -1;
418 (void)snprintf(path, sizeof(path),
419 "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp");
420
421 #ifndef _WIN32
422 (void)sigfillset(&set);
423 (void)sigprocmask(SIG_BLOCK, &set, &oset);
424 #endif
425 if ((fd = mkstemp(path)) != -1)
426 (void)unlink(path);
427 #ifndef _WIN32
428 (void)sigprocmask(SIG_SETMASK, &oset, NULL);
429 #endif
430 return(fd);
431 }
432
433 static int
434 byteorder(void)
435 {
436 u_int32_t x;
437 u_char *p;
438
439 x = 0x01020304;
440 p = (u_char *)&x;
441 switch (*p) {
442 case 1:
443 return (BIG_ENDIAN);
444 case 4:
445 return (LITTLE_ENDIAN);
446 default:
447 return (0);
448 }
449 }
450
451 int
452 __bt_fd(dbp)
453 const DB *dbp;
454 {
455 BTREE *t;
456
457 t = dbp->internal;
458
459 /* Toss any page pinned across calls. */
460 if (t->bt_pinned != NULL) {
461 mpool_put(t->bt_mp, t->bt_pinned, 0);
462 t->bt_pinned = NULL;
463 }
464
465 /* In-memory database can't have a file descriptor. */
466 if (F_ISSET(t, B_INMEM)) {
467 errno = ENOENT;
468 return (-1);
469 }
470 return (t->bt_fd);
471 }
/* */