/* */
1 /*-
2 * Copyright (c) 1991, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)btree.h 8.11 (Berkeley) 8/17/94
33 */
34
35 /** @name Macros to set/clear/test flags. */
36 /** @{ */
37 #define F_SET(p, f) (p)->flags |= (f)
38 #define F_CLR(p, f) (p)->flags &= ~(f)
39 #define F_ISSET(p, f) ((p)->flags & (f))
40 /** @} */
41
42 #include "mpool.h"
43
44 /** Minimum keys per page */
45 #define DEFMINKEYPAGE (2)
46
47 /** Minimum cached pages */
48 #define MINCACHE (5)
49
50 /** Minimum page size */
51 #define MINPSIZE (512)
52
53 /** @file
54 * @details
55 * Page 0 of a btree file contains a copy of the meta-data. This page is also
56 * used as an out-of-band page, i.e. page pointers that point to nowhere point
57 * to page 0. Page 1 is the root of the btree.
58 */
59 /** Invalid tree page number. */
60 #define P_INVALID 0
61 /** Tree metadata page number. */
62 #define P_META 0
63 /** Tree root page number. */
64 #define P_ROOT 1
65
66 /**
67 * There are five page layouts in the btree: btree internal pages (#BINTERNAL),
68 * btree leaf pages (#BLEAF), recno internal pages (#RINTERNAL), recno leaf pages
69 * (#RLEAF) and overflow pages. All five page types have a page header (#PAGE). <br>
70 * @STRONG{This implementation requires that values within structures NOT be padded}.
71 * (ANSI C permits random padding.) If your compiler pads randomly you'll have
72 * to do some work to get this package to run.
73 */
74 typedef struct _page {
75 pgno_t pgno; /**< this page's page number */
76 pgno_t prevpg; /**< left sibling */
77 pgno_t nextpg; /**< right sibling */
78
79 /** btree internal page */
80 #define P_BINTERNAL 0x01
81 /** leaf page */
82 #define P_BLEAF 0x02
83 /** overflow page */
84 #define P_OVERFLOW 0x04
85 /** recno internal page */
86 #define P_RINTERNAL 0x08
87 /** leaf page */
88 #define P_RLEAF 0x10
89 /** type mask */
90 #define P_TYPE 0x1f
91 /** never delete this chain of pages */
92 #define P_PRESERVE 0x20
93 u_int32_t flags;
94
95 indx_t lower; /**< lower bound of free space on page */
96 indx_t upper; /**< upper bound of free space on page */
97 indx_t linp[1]; /**< #indx_t-aligned VAR. LENGTH DATA */
98 } PAGE;
99
100 /** First and next index. */
101 #define BTDATAOFF \
102 (sizeof(pgno_t) + sizeof(pgno_t) + sizeof(pgno_t) + \
103 sizeof(u_int32_t) + sizeof(indx_t) + sizeof(indx_t))
104 #define NEXTINDEX(p) (((p)->lower - BTDATAOFF) / sizeof(indx_t))
105
106 /**
107 * For pages other than overflow pages, there is an array of offsets into the
108 * rest of the page immediately following the page header. Each offset is to
109 * an item which is unique to the type of page. The @NAME{h_lower} offset is just
110 * past the last filled-in index. The @NAME{h_upper} offset is the first item on the
111 * page. Offsets are from the beginning of the page.
112 *
113 * If an item is too big to store on a single page, a flag is set and the item
114 * is a {@NAME{page}, @NAME{size}} pair such that the page is the first page of an overflow
115 * chain with size bytes of item. Overflow pages are simply bytes without any
116 * external structure.
117 *
118 * The page number and size fields in the items are #pgno_t-aligned so they can
119 * be manipulated without copying. (This presumes that 32 bit items can be
120 * manipulated on this system.)
121 */
122 #define LALIGN(n) (((n) + sizeof(pgno_t) - 1) & ~(sizeof(pgno_t) - 1))
123 #define NOVFLSIZE (sizeof(pgno_t) + sizeof(u_int32_t))
124
125 /**
126 * For the btree internal pages, the item is a key. @VAR{BINTERNAL}s are {@NAME{key}, @NAME{pgno}}
127 * pairs, such that the key compares less than or equal to all of the records
128 * on that page. For a tree without duplicate keys, an internal page with two
129 * consecutive keys, @EMPH{a} and @EMPH{b}, will have all records greater than or equal to @EMPH{a}
130 * and less than @EMPH{b} stored on the page associated with @EMPH{a}. Duplicate keys are
131 * somewhat special and can cause duplicate internal and leaf page records and
132 * some minor modifications of the above rule.
133 */
134 typedef struct _binternal {
135 u_int32_t ksize; /**< key size */
136 pgno_t pgno; /**< page number stored on */
137
138 /** overflow data */
139 #define P_BIGDATA 0x01
140 /** overflow key */
141 #define P_BIGKEY 0x02
142 u_char flags;
143 char bytes[1]; /**< data */
144 } BINTERNAL;
145
146 /** Get the page's #BINTERNAL structure at index indx. */
147 #define GETBINTERNAL(pg, indx) \
148 ((BINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
149
150 /** Get the number of bytes in the entry. */
151 #define NBINTERNAL(len) \
152 LALIGN(sizeof(u_int32_t) + sizeof(pgno_t) + sizeof(u_char) + (len))
153
154 /** Copy a #BINTERNAL entry to the page. */
155 #define WR_BINTERNAL(p, size, pgno, flags) { \
156 *(u_int32_t *)p = size; \
157 p += sizeof(u_int32_t); \
158 *(pgno_t *)p = pgno; \
159 p += sizeof(pgno_t); \
160 *(u_char *)p = flags; \
161 p += sizeof(u_char); \
162 }
163
164 /**
165 * For the recno internal pages, the item is a page number with the number of
166 * keys found on that page and below.
167 */
168 typedef struct _rinternal {
169 recno_t nrecs; /**< number of records */
170 pgno_t pgno; /**< page number stored below */
171 } RINTERNAL;
172
173 /** Get the page's #RINTERNAL structure at index indx. */
174 #define GETRINTERNAL(pg, indx) \
175 ((RINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
176
177 /** Get the number of bytes in the entry. */
178 #define NRINTERNAL \
179 LALIGN(sizeof(recno_t) + sizeof(pgno_t))
180
181 /** Copy a #RINTERNAL entry to the page. */
182 #define WR_RINTERNAL(p, nrecs, pgno) { \
183 *(recno_t *)p = nrecs; \
184 p += sizeof(recno_t); \
185 *(pgno_t *)p = pgno; \
186 }
187
188 /** For the btree leaf pages, the item is a key and data pair. */
189 typedef struct _bleaf {
190 u_int32_t ksize; /**< size of key */
191 u_int32_t dsize; /**< size of data */
192 u_char flags; /**< #P_BIGDATA, #P_BIGKEY */
193 char bytes[1]; /**< data */
194 } BLEAF;
195
196 /** Get the page's #BLEAF structure at index indx. */
197 #define GETBLEAF(pg, indx) \
198 ((BLEAF *)((char *)(pg) + (pg)->linp[indx]))
199
200 /** Get the number of bytes in the entry. */
201 #define NBLEAF(p) NBLEAFDBT((p)->ksize, (p)->dsize)
202
203 /** Get the number of bytes in the user's key/data pair. */
204 #define NBLEAFDBT(ksize, dsize) \
205 LALIGN(sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_char) + \
206 (ksize) + (dsize))
207
208 /** Copy a #BLEAF entry to the page. */
209 #define WR_BLEAF(p, key, data, flags) { \
210 *(u_int32_t *)p = key->size; \
211 p += sizeof(u_int32_t); \
212 *(u_int32_t *)p = data->size; \
213 p += sizeof(u_int32_t); \
214 *(u_char *)p = flags; \
215 p += sizeof(u_char); \
216 memmove(p, key->data, key->size); \
217 p += key->size; \
218 memmove(p, data->data, data->size); \
219 }
220
221 /** For the recno leaf pages, the item is a data entry. */
222 typedef struct _rleaf {
223 u_int32_t dsize; /**< size of data */
224 u_char flags; /**< #P_BIGDATA */
225 char bytes[1];
226 } RLEAF;
227
228 /** Get the page's #RLEAF structure at index indx. */
229 #define GETRLEAF(pg, indx) \
230 ((RLEAF *)((char *)(pg) + (pg)->linp[indx]))
231
232 /** Get the number of bytes in the entry. */
233 #define NRLEAF(p) NRLEAFDBT((p)->dsize)
234
235 /** Get the number of bytes from the user's data. */
236 #define NRLEAFDBT(dsize) \
237 LALIGN(sizeof(u_int32_t) + sizeof(u_char) + (dsize))
238
239 /** Copy a #RLEAF entry to the page. */
240 #define WR_RLEAF(p, data, flags) { \
241 *(u_int32_t *)p = data->size; \
242 p += sizeof(u_int32_t); \
243 *(u_char *)p = flags; \
244 p += sizeof(u_char); \
245 memmove(p, data->data, data->size); \
246 }
247
248 /**
249 * A record in the tree is either a pointer to a page and an index in the page
250 * or a page number and an index. These structures are used as a cursor, stack
251 * entry and search returns as well as to pass records to other routines.
252 *
253 * One comment about searches. Internal page searches must find the largest
254 * record less than key in the tree so that descents work. Leaf page searches
255 * must find the smallest record greater than key so that the returned index
256 * is the record's correct position for insertion.
257 */
258 typedef struct _epgno {
259 pgno_t pgno; /**< the page number */
260 indx_t index; /**< the index on the page */
261 } EPGNO;
262
263 typedef struct _epg {
264 PAGE *page; /**< the (pinned) page */
265 indx_t index; /**< the index on the page */
266 } EPG;
267
268 /**
269 * About cursors. The cursor (and the page that contained the key/data pair
270 * that it referenced) can be deleted, which makes things a bit tricky. If
271 * there are no duplicates of the cursor key in the tree (i.e. #B_NODUPS is set
272 * or there simply aren't any duplicates of the key) we copy the key that it
273 * referenced when it's deleted, and reacquire a new cursor key if the cursor
274 * is used again. If there are duplicates keys, we move to the next/previous
275 * key, and set a flag so that we know what happened. @b NOTE: if duplicate (to
276 * the cursor) keys are added to the tree during this process, it is undefined
277 * if they will be returned or not in a cursor scan.
278 *
279 * The flags determine the possible states of the cursor:
280 *
281 * @par CURS_INIT
282 * The cursor references @STRONG{*something*}.
283 * @par CURS_ACQUIRE
284 * The cursor was deleted, and a key has been saved so that
285 * we can reacquire the right position in the tree.
286 * @par CURS_AFTER, CURS_BEFORE
287 * The cursor was deleted, and now references a key/data pair
288 * that has not yet been returned, either before or after the
289 * deleted key/data pair.
290 * @par XXX
291 * This structure is broken out so that we can eventually offer multiple
292 * cursors as part of the #DB interface.
293 */
294 typedef struct _cursor {
295 EPGNO pg; /**< B: Saved tree reference. */
296 DBT key; /**< B: Saved key, or @CODE{key.data == NULL}. */
297 recno_t rcursor; /**< R: recno cursor (1-based) */
298
299 /** B: Cursor needs to be reacquired. */
300 #define CURS_ACQUIRE 0x01
301 /** B: Unreturned cursor after key. */
302 #define CURS_AFTER 0x02
303 /** B: Unreturned cursor before key. */
304 #define CURS_BEFORE 0x04
305 /** RB: Cursor initialized. */
306 #define CURS_INIT 0x08
307 u_int8_t flags;
308 } CURSOR;
309
310 /**
311 * The metadata of the tree. The @link BTMETA::nrecs nrecs @endlink field is used only by the RECNO code.
312 * This is because the btree doesn't really need it and it requires that every
313 * put or delete call modify the metadata.
314 */
315 typedef struct _btmeta {
316 u_int32_t magic; /**< magic number */
317 u_int32_t version; /**< version */
318 u_int32_t psize; /**< page size */
319 u_int32_t free; /**< page number of first free page */
320 u_int32_t nrecs; /**< R: number of records */
321
322 #define SAVEMETA (B_NODUPS | R_RECNO)
323 u_int32_t flags; /**< bt_flags \& #SAVEMETA */
324 } BTMETA;
325
326 /** The in-memory btree/recno data structure. */
327 typedef struct _btree {
328 MPOOL *bt_mp; /**< memory pool cookie */
329
330 DB *bt_dbp; /**< pointer to enclosing #DB */
331
332 EPG bt_cur; /**< current (pinned) page */
333 PAGE *bt_pinned; /**< page pinned across calls */
334
335 CURSOR bt_cursor; /**< cursor */
336
337 #define BT_PUSH(t, p, i) { \
338 t->bt_sp->pgno = p; \
339 t->bt_sp->index = i; \
340 ++t->bt_sp; \
341 }
342 #define BT_POP(t) (t->bt_sp == t->bt_stack ? NULL : --t->bt_sp)
343 #define BT_CLR(t) (t->bt_sp = t->bt_stack)
344 EPGNO bt_stack[50]; /**< stack of parent pages */
345 EPGNO *bt_sp; /**< current stack pointer */
346
347 DBT bt_rkey; /**< returned key */
348 DBT bt_rdata; /**< returned data */
349
350 int bt_fd; /**< tree file descriptor */
351
352 pgno_t bt_free; /**< next free page */
353 u_int32_t bt_psize; /**< page size */
354 indx_t bt_ovflsize; /**< cut-off for key/data overflow */
355 int bt_lorder; /**< byte order */
356 /** sorted order */
357 enum { NOT, BACK, FORWARD } bt_order;
358 EPGNO bt_last; /**< last insert */
359
360 /** B: key comparison function */
361 int (*bt_cmp)(const DBT *, const DBT *);
362 /** B: prefix comparison function */
363 size_t (*bt_pfx)(const DBT *, const DBT *);
364 /** R: recno input function */
365 int (*bt_irec)(struct _btree *, recno_t);
366
367 FILE *bt_rfp; /**< R: record @VAR{FILE} pointer */
368 int bt_rfd; /**< R: record file descriptor */
369
370 caddr_t bt_cmap; /**< R: current point in mapped space */
371 caddr_t bt_smap; /**< R: start of mapped space */
372 caddr_t bt_emap; /**< R: end of mapped space */
373 size_t bt_msize; /**< R: size of mapped region. */
374
375 recno_t bt_nrecs; /**< R: number of records */
376 size_t bt_reclen; /**< R: fixed record length */
377 u_char bt_bval; /**< R: delimiting byte/pad character */
378
379 /*
380 * NB:
381 * B_NODUPS and R_RECNO are stored on disk, and may not be changed.
382 */
383 /** in-memory tree */
384 #define B_INMEM 0x00001
385 /** need to write metadata */
386 #define B_METADIRTY 0x00002
387 /** tree modified */
388 #define B_MODIFIED 0x00004
389 /** if byte order requires swapping */
390 #define B_NEEDSWAP 0x00008
391 /** read-only tree */
392 #define B_RDONLY 0x00010
393
394 /** no duplicate keys permitted.
395 @note #B_NODUPS is stored on disk, and may not be changed. */
396 #define B_NODUPS 0x00020
397
398 /** record oriented tree.
399 @note #R_RECNO is stored on disk, and may not be changed. */
400 #define R_RECNO 0x00080
401
402 /** opened a file pointer */
403 #define R_CLOSEFP 0x00040
404 /** end of input file reached. */
405 #define R_EOF 0x00100
406 /** fixed length records */
407 #define R_FIXLEN 0x00200
408 /** memory mapped file. */
409 #define R_MEMMAPPED 0x00400
410 /** in-memory file */
411 #define R_INMEM 0x00800
412 /** modified file */
413 #define R_MODIFIED 0x01000
414 /** read-only file */
415 #define R_RDONLY 0x02000
416
417 /** #DB_LOCK specified. */
418 #define B_DB_LOCK 0x04000
419 /** #DB_SHMEM specified. */
420 #define B_DB_SHMEM 0x08000
421 /** #DB_TXN specified. */
422 #define B_DB_TXN 0x10000
423 u_int32_t flags;
424 } BTREE;
425
426 #include "extern.h"
/* */