/* */
This source file includes following definitions.
- __bt_split
- bt_page
- bt_root
- bt_rroot
- bt_broot
- bt_psplit
- bt_preserve
- rec_total
1 /*-
2 * Copyright (c) 1990, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33 #if defined(LIBC_SCCS) && !defined(lint)
34 static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94";
35 #endif /* LIBC_SCCS and not lint */
36
37 #ifdef HAVE_CONFIG_H
38 #include <config.h>
39 #endif
40 #include <sys/types.h>
41 #ifdef HAVE_LIMITS_H
42 #include <limits.h>
43 #endif
44 #include <stdio.h>
45 #ifdef STDC_HEADERS
46 #include <stdlib.h>
47 #endif
48 #ifdef HAVE_STRING_H
49 #include <string.h>
50 #else
51 #include <strings.h>
52 #endif
53
54 #include "db.h"
55 #include "btree.h"
56
57 static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
58 static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
59 static int bt_preserve(BTREE *, pgno_t);
60 static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
61 static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
62 static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
63 static recno_t rec_total(PAGE *);
64
65 u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
66
67 /**
68 * __BT_SPLIT -- Split the tree.
69 *
70 * @param t tree
71 * @param sp page to split
72 * @param key key to insert
73 * @param data data to insert
74 * @param flags #P_BIGKEY/#P_BIGDATA flags
75 * @param ilen insert length
76 * @param argskip index to leave open
77 *
78 * @return
79 * #RET_ERROR, #RET_SUCCESS
80 */
81 int
82 __bt_split(t, sp, key, data, flags, ilen, argskip)
83 BTREE *t;
84 PAGE *sp;
85 const DBT *key, *data;
86 int flags;
87 size_t ilen;
88 u_int32_t argskip;
89 {
90 BINTERNAL *bi = NULL;
91 BLEAF *bl = NULL, *tbl;
92 DBT a, b;
93 EPGNO *parent;
94 PAGE *h, *l, *r, *lchild, *rchild;
95 indx_t nxtindex;
96 u_int16_t skip;
97 u_int32_t n, nbytes, nksize = 0;
98 int parentsplit;
99 char *dest;
100
101 /*
102 * Split the page into two pages, l and r. The split routines return
103 * a pointer to the page into which the key should be inserted and with
104 * skip set to the offset which should be used. Additionally, l and r
105 * are pinned.
106 */
107 skip = argskip;
108 h = sp->pgno == P_ROOT ?
109 bt_root(t, sp, &l, &r, &skip, ilen) :
110 bt_page(t, sp, &l, &r, &skip, ilen);
111 if (h == NULL)
112 return (RET_ERROR);
113
114 /*
115 * Insert the new key/data pair into the leaf page. (Key inserts
116 * always cause a leaf page to split first.)
117 */
118 h->linp[skip] = h->upper -= ilen;
119 dest = (char *)h + h->upper;
120 if (F_ISSET(t, R_RECNO))
121 WR_RLEAF(dest, data, flags)
122 else
123 WR_BLEAF(dest, key, data, flags)
124
125 /* If the root page was split, make it look right. */
126 if (sp->pgno == P_ROOT &&
127 (F_ISSET(t, R_RECNO) ?
128 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
129 goto err2;
130
131 /*
132 * Now we walk the parent page stack -- a LIFO stack of the pages that
133 * were traversed when we searched for the page that split. Each stack
134 * entry is a page number and a page index offset. The offset is for
135 * the page traversed on the search. We've just split a page, so we
136 * have to insert a new key into the parent page.
137 *
138 * If the insert into the parent page causes it to split, may have to
139 * continue splitting all the way up the tree. We stop if the root
140 * splits or the page inserted into didn't have to split to hold the
141 * new key. Some algorithms replace the key for the old page as well
142 * as the new page. We don't, as there's no reason to believe that the
143 * first key on the old page is any better than the key we have, and,
144 * in the case of a key being placed at index 0 causing the split, the
145 * key is unavailable.
146 *
147 * There are a maximum of 5 pages pinned at any time. We keep the left
148 * and right pages pinned while working on the parent. The 5 are the
149 * two children, left parent and right parent (when the parent splits)
150 * and the root page or the overflow key page when calling bt_preserve.
151 * This code must make sure that all pins are released other than the
152 * root page or overflow page which is unlocked elsewhere.
153 */
154 while ((parent = BT_POP(t)) != NULL) {
155 lchild = l;
156 rchild = r;
157
158 /* Get the parent page. */
159 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
160 goto err2;
161
162 /*
163 * The new key goes ONE AFTER the index, because the split
164 * was to the right.
165 */
166 skip = parent->index + 1;
167
168 /*
169 * Calculate the space needed on the parent page.
170 *
171 * Prefix trees: space hack when inserting into BINTERNAL
172 * pages. Retain only what's needed to distinguish between
173 * the new entry and the LAST entry on the page to its left.
174 * If the keys compare equal, retain the entire key. Note,
175 * we don't touch overflow keys, and the entire key must be
176 * retained for the next-to-left most key on the leftmost
177 * page of each level, or the search will fail. Applicable
178 * ONLY to internal pages that have leaf pages as children.
179 * Further reduction of the key between pairs of internal
180 * pages loses too much information.
181 */
182 switch (rchild->flags & P_TYPE) {
183 case P_BINTERNAL:
184 bi = GETBINTERNAL(rchild, 0);
185 nbytes = NBINTERNAL(bi->ksize);
186 break;
187 case P_BLEAF:
188 bl = GETBLEAF(rchild, 0);
189 nbytes = NBINTERNAL(bl->ksize);
190 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
191 (h->prevpg != P_INVALID || skip > 1)) {
192 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
193 a.size = tbl->ksize;
194 a.data = tbl->bytes;
195 b.size = bl->ksize;
196 b.data = bl->bytes;
197 nksize = t->bt_pfx(&a, &b);
198 n = NBINTERNAL(nksize);
199 if (n < nbytes) {
200 #ifdef STATISTICS
201 bt_pfxsaved += nbytes - n;
202 #endif
203 nbytes = n;
204 } else
205 nksize = 0;
206 } else
207 nksize = 0;
208 break;
209 case P_RINTERNAL:
210 case P_RLEAF:
211 nbytes = NRINTERNAL;
212 break;
213 default:
214 abort();
215 }
216
217 /* Split the parent page if necessary or shift the indices. */
218 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
219 sp = h;
220 h = h->pgno == P_ROOT ?
221 bt_root(t, h, &l, &r, &skip, nbytes) :
222 bt_page(t, h, &l, &r, &skip, nbytes);
223 if (h == NULL)
224 goto err1;
225 parentsplit = 1;
226 } else {
227 if (skip < (nxtindex = NEXTINDEX(h)))
228 memmove(h->linp + skip + 1, h->linp + skip,
229 (nxtindex - skip) * sizeof(indx_t));
230 h->lower += sizeof(indx_t);
231 parentsplit = 0;
232 }
233
234 /* Insert the key into the parent page. */
235 switch (rchild->flags & P_TYPE) {
236 case P_BINTERNAL:
237 h->linp[skip] = h->upper -= nbytes;
238 dest = (char *)h + h->linp[skip];
239 memmove(dest, bi, nbytes);
240 ((BINTERNAL *)dest)->pgno = rchild->pgno;
241 break;
242 case P_BLEAF:
243 h->linp[skip] = h->upper -= nbytes;
244 dest = (char *)h + h->linp[skip];
245 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
246 rchild->pgno, bl->flags & P_BIGKEY);
247 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
248 if (bl->flags & P_BIGKEY &&
249 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
250 goto err1;
251 break;
252 case P_RINTERNAL:
253 /*
254 * Update the left page count. If split
255 * added at index 0, fix the correct page.
256 */
257 if (skip > 0)
258 dest = (char *)h + h->linp[skip - 1];
259 else
260 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
261 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
262 ((RINTERNAL *)dest)->pgno = lchild->pgno;
263
264 /* Update the right page count. */
265 h->linp[skip] = h->upper -= nbytes;
266 dest = (char *)h + h->linp[skip];
267 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
268 ((RINTERNAL *)dest)->pgno = rchild->pgno;
269 break;
270 case P_RLEAF:
271 /*
272 * Update the left page count. If split
273 * added at index 0, fix the correct page.
274 */
275 if (skip > 0)
276 dest = (char *)h + h->linp[skip - 1];
277 else
278 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
279 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
280 ((RINTERNAL *)dest)->pgno = lchild->pgno;
281
282 /* Update the right page count. */
283 h->linp[skip] = h->upper -= nbytes;
284 dest = (char *)h + h->linp[skip];
285 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
286 ((RINTERNAL *)dest)->pgno = rchild->pgno;
287 break;
288 default:
289 abort();
290 }
291
292 /* Unpin the held pages. */
293 if (!parentsplit) {
294 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
295 break;
296 }
297
298 /* If the root page was split, make it look right. */
299 if (sp->pgno == P_ROOT &&
300 (F_ISSET(t, R_RECNO) ?
301 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
302 goto err1;
303
304 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
305 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
306 }
307
308 /* Unpin the held pages. */
309 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
310 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
311
312 /* Clear any pages left on the stack. */
313 return (RET_SUCCESS);
314
315 /*
316 * If something fails in the above loop we were already walking back
317 * up the tree and the tree is now inconsistent. Nothing much we can
318 * do about it but release any memory we're holding.
319 */
320 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
321 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
322
323 err2: mpool_put(t->bt_mp, l, 0);
324 mpool_put(t->bt_mp, r, 0);
325 __dbpanic(t->bt_dbp);
326 return (RET_ERROR);
327 }
328
329 /**
330 * BT_PAGE -- Split a non-root page of a btree.
331 *
332 * @param t tree
333 * @param h root page
334 * @param lp pointer to left page pointer
335 * @param rp pointer to right page pointer
336 * @param skip pointer to index to leave open
337 * @param ilen insert length
338 *
339 * @return
340 * Pointer to page in which to insert or @CODE{NULL} on error.
341 */
342 static PAGE *
343 bt_page(t, h, lp, rp, skip, ilen)
344 BTREE *t;
345 PAGE *h, **lp, **rp;
346 indx_t *skip;
347 size_t ilen;
348 {
349 PAGE *l, *r, *tp;
350 pgno_t npg;
351
352 #ifdef STATISTICS
353 ++bt_split;
354 #endif
355 /* Put the new right page for the split into place. */
356 if ((r = __bt_new(t, &npg)) == NULL)
357 return (NULL);
358 r->pgno = npg;
359 r->lower = BTDATAOFF;
360 r->upper = t->bt_psize;
361 r->nextpg = h->nextpg;
362 r->prevpg = h->pgno;
363 r->flags = h->flags & P_TYPE;
364
365 /*
366 * If we're splitting the last page on a level because we're appending
367 * a key to it (skip is NEXTINDEX()), it's likely that the data is
368 * sorted. Adding an empty page on the side of the level is less work
369 * and can push the fill factor much higher than normal. If we're
370 * wrong it's no big deal, we'll just do the split the right way next
371 * time. It may look like it's equally easy to do a similar hack for
372 * reverse sorted data, that is, split the tree left, but it's not.
373 * Don't even try.
374 */
375 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
376 #ifdef STATISTICS
377 ++bt_sortsplit;
378 #endif
379 h->nextpg = r->pgno;
380 r->lower = BTDATAOFF + sizeof(indx_t);
381 *skip = 0;
382 *lp = h;
383 *rp = r;
384 return (r);
385 }
386
387 /* Put the new left page for the split into place. */
388 if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
389 mpool_put(t->bt_mp, r, 0);
390 return (NULL);
391 }
392 #ifdef PURIFY
393 memset(l, 0xff, t->bt_psize);
394 #endif
395 l->pgno = h->pgno;
396 l->nextpg = r->pgno;
397 l->prevpg = h->prevpg;
398 l->lower = BTDATAOFF;
399 l->upper = t->bt_psize;
400 l->flags = h->flags & P_TYPE;
401
402 /* Fix up the previous pointer of the page after the split page. */
403 if (h->nextpg != P_INVALID) {
404 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
405 free(l);
406 /* XXX mpool_free(t->bt_mp, r->pgno); */
407 return (NULL);
408 }
409 tp->prevpg = r->pgno;
410 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
411 }
412
413 /*
414 * Split right. The key/data pairs aren't sorted in the btree page so
415 * it's simpler to copy the data from the split page onto two new pages
416 * instead of copying half the data to the right page and compacting
417 * the left page in place. Since the left page can't change, we have
418 * to swap the original and the allocated left page after the split.
419 */
420 tp = bt_psplit(t, h, l, r, skip, ilen);
421
422 /* Move the new left page onto the old left page. */
423 memmove(h, l, t->bt_psize);
424 if (tp == l)
425 tp = h;
426 free(l);
427
428 *lp = h;
429 *rp = r;
430 return (tp);
431 }
432
433 /**
434 * BT_ROOT -- Split the root page of a btree.
435 *
436 * @param t tree
437 * @param h root page
438 * @param lp pointer to left page pointer
439 * @param rp pointer to right page pointer
440 * @param skip pointer to index to leave open
441 * @param ilen insert length
442 *
443 * @return
444 * Pointer to page in which to insert or @CODE{NULL} on error.
445 */
446 static PAGE *
447 bt_root(t, h, lp, rp, skip, ilen)
448 BTREE *t;
449 PAGE *h, **lp, **rp;
450 indx_t *skip;
451 size_t ilen;
452 {
453 PAGE *l, *r, *tp;
454 pgno_t lnpg, rnpg;
455
456 #ifdef STATISTICS
457 ++bt_split;
458 ++bt_rootsplit;
459 #endif
460 /* Put the new left and right pages for the split into place. */
461 if ((l = __bt_new(t, &lnpg)) == NULL ||
462 (r = __bt_new(t, &rnpg)) == NULL)
463 return (NULL);
464 l->pgno = lnpg;
465 r->pgno = rnpg;
466 l->nextpg = r->pgno;
467 r->prevpg = l->pgno;
468 l->prevpg = r->nextpg = P_INVALID;
469 l->lower = r->lower = BTDATAOFF;
470 l->upper = r->upper = t->bt_psize;
471 l->flags = r->flags = h->flags & P_TYPE;
472
473 /* Split the root page. */
474 tp = bt_psplit(t, h, l, r, skip, ilen);
475
476 *lp = l;
477 *rp = r;
478 return (tp);
479 }
480
481 /**
482 * BT_RROOT -- Fix up the recno root page after it has been split.
483 *
484 * @param t tree
485 * @param h root page
486 * @param l left page
487 * @param r right page
488 *
489 * @return
490 * #RET_ERROR, #RET_SUCCESS
491 */
492 static int
493 bt_rroot(t, h, l, r)
494 BTREE *t;
495 PAGE *h, *l, *r;
496 {
497 char *dest;
498
499 /* Insert the left and right keys, set the header information. */
500 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
501 dest = (char *)h + h->upper;
502 WR_RINTERNAL(dest,
503 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
504
505 h->linp[1] = h->upper -= NRINTERNAL;
506 dest = (char *)h + h->upper;
507 WR_RINTERNAL(dest,
508 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
509
510 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
511
512 /* Unpin the root page, set to recno internal page. */
513 h->flags &= ~P_TYPE;
514 h->flags |= P_RINTERNAL;
515 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
516
517 return (RET_SUCCESS);
518 }
519
520 /**
521 * BT_BROOT -- Fix up the btree root page after it has been split.
522 *
523 * @param t tree
524 * @param h root page
525 * @param l left page
526 * @param r right page
527 *
528 * @return
529 * #RET_ERROR, #RET_SUCCESS
530 */
531 static int
532 bt_broot(t, h, l, r)
533 BTREE *t;
534 PAGE *h, *l, *r;
535 {
536 BINTERNAL *bi;
537 BLEAF *bl;
538 u_int32_t nbytes;
539 char *dest;
540
541 /*
542 * If the root page was a leaf page, change it into an internal page.
543 * We copy the key we split on (but not the key's data, in the case of
544 * a leaf page) to the new root page.
545 *
546 * The btree comparison code guarantees that the left-most key on any
547 * level of the tree is never used, so it doesn't need to be filled in.
548 */
549 nbytes = NBINTERNAL(0);
550 h->linp[0] = h->upper = t->bt_psize - nbytes;
551 dest = (char *)h + h->upper;
552 WR_BINTERNAL(dest, 0, l->pgno, 0);
553
554 switch (h->flags & P_TYPE) {
555 case P_BLEAF:
556 bl = GETBLEAF(r, 0);
557 nbytes = NBINTERNAL(bl->ksize);
558 h->linp[1] = h->upper -= nbytes;
559 dest = (char *)h + h->upper;
560 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
561 memmove(dest, bl->bytes, bl->ksize);
562
563 /*
564 * If the key is on an overflow page, mark the overflow chain
565 * so it isn't deleted when the leaf copy of the key is deleted.
566 */
567 if (bl->flags & P_BIGKEY &&
568 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
569 return (RET_ERROR);
570 break;
571 case P_BINTERNAL:
572 bi = GETBINTERNAL(r, 0);
573 nbytes = NBINTERNAL(bi->ksize);
574 h->linp[1] = h->upper -= nbytes;
575 dest = (char *)h + h->upper;
576 memmove(dest, bi, nbytes);
577 ((BINTERNAL *)dest)->pgno = r->pgno;
578 break;
579 default:
580 abort();
581 }
582
583 /* There are two keys on the page. */
584 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
585
586 /* Unpin the root page, set to btree internal page. */
587 h->flags &= ~P_TYPE;
588 h->flags |= P_BINTERNAL;
589 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
590
591 return (RET_SUCCESS);
592 }
593
594 /**
595 * BT_PSPLIT -- Do the real work of splitting the page.
596 *
597 * @param t tree
598 * @param h page to be split
599 * @param l page to put lower half of data
600 * @param r page to put upper half of data
601 * @param pskip pointer to index to leave open
602 * @param ilen insert length
603 *
604 * @return
605 * Pointer to page in which to insert.
606 */
607 static PAGE *
608 bt_psplit(t, h, l, r, pskip, ilen)
609 BTREE *t;
610 PAGE *h, *l, *r;
611 indx_t *pskip;
612 size_t ilen;
613 {
614 BINTERNAL *bi;
615 BLEAF *bl;
616 CURSOR *c;
617 RLEAF *rl;
618 PAGE *rval;
619 void *src = NULL;
620 indx_t full, half, nxt, off, skip, top, used;
621 u_int32_t nbytes;
622 int bigkeycnt, isbigkey;
623
624 /*
625 * Split the data to the left and right pages. Leave the skip index
626 * open. Additionally, make some effort not to split on an overflow
627 * key. This makes internal page processing faster and can save
628 * space as overflow keys used by internal pages are never deleted.
629 */
630 bigkeycnt = 0;
631 skip = *pskip;
632 full = t->bt_psize - BTDATAOFF;
633 half = full / 2;
634 used = 0;
635 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
636 if (skip == off) {
637 nbytes = ilen;
638 isbigkey = 0; /* XXX: not really known. */
639 } else
640 switch (h->flags & P_TYPE) {
641 case P_BINTERNAL:
642 src = bi = GETBINTERNAL(h, nxt);
643 nbytes = NBINTERNAL(bi->ksize);
644 isbigkey = bi->flags & P_BIGKEY;
645 break;
646 case P_BLEAF:
647 src = bl = GETBLEAF(h, nxt);
648 nbytes = NBLEAF(bl);
649 isbigkey = bl->flags & P_BIGKEY;
650 break;
651 case P_RINTERNAL:
652 src = GETRINTERNAL(h, nxt);
653 nbytes = NRINTERNAL;
654 isbigkey = 0;
655 break;
656 case P_RLEAF:
657 src = rl = GETRLEAF(h, nxt);
658 nbytes = NRLEAF(rl);
659 isbigkey = 0;
660 break;
661 default:
662 abort();
663 }
664
665 /*
666 * If the key/data pairs are substantial fractions of the max
667 * possible size for the page, it's possible to get situations
668 * where we decide to try and copy too much onto the left page.
669 * Make sure that doesn't happen.
670 */
671 if ((skip <= off &&
672 used + nbytes + sizeof(indx_t) >= full) || nxt == top - 1) {
673 --off;
674 break;
675 }
676
677 /* Copy the key/data pair, if not the skipped index. */
678 if (skip != off) {
679 ++nxt;
680
681 l->linp[off] = l->upper -= nbytes;
682 memmove((char *)l + l->upper, src, nbytes);
683 }
684
685 used += nbytes + sizeof(indx_t);
686 if (used >= half) {
687 if (!isbigkey || bigkeycnt == 3)
688 break;
689 else
690 ++bigkeycnt;
691 }
692 }
693
694 /*
695 * Off is the last offset that's valid for the left page.
696 * Nxt is the first offset to be placed on the right page.
697 */
698 l->lower += (off + 1) * sizeof(indx_t);
699
700 /*
701 * If splitting the page that the cursor was on, the cursor has to be
702 * adjusted to point to the same record as before the split. If the
703 * cursor is at or past the skipped slot, the cursor is incremented by
704 * one. If the cursor is on the right page, it is decremented by the
705 * number of records split to the left page.
706 */
707 c = &t->bt_cursor;
708 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
709 if (c->pg.index >= skip)
710 ++c->pg.index;
711 if (c->pg.index < nxt) /* Left page. */
712 c->pg.pgno = l->pgno;
713 else { /* Right page. */
714 c->pg.pgno = r->pgno;
715 c->pg.index -= nxt;
716 }
717 }
718
719 /*
720 * If the skipped index was on the left page, just return that page.
721 * Otherwise, adjust the skip index to reflect the new position on
722 * the right page.
723 */
724 if (skip <= off) {
725 skip = 0;
726 rval = l;
727 } else {
728 rval = r;
729 *pskip -= nxt;
730 }
731
732 for (off = 0; nxt < top; ++off) {
733 if (skip == nxt) {
734 ++off;
735 skip = 0;
736 }
737 switch (h->flags & P_TYPE) {
738 case P_BINTERNAL:
739 src = bi = GETBINTERNAL(h, nxt);
740 nbytes = NBINTERNAL(bi->ksize);
741 break;
742 case P_BLEAF:
743 src = bl = GETBLEAF(h, nxt);
744 nbytes = NBLEAF(bl);
745 break;
746 case P_RINTERNAL:
747 src = GETRINTERNAL(h, nxt);
748 nbytes = NRINTERNAL;
749 break;
750 case P_RLEAF:
751 src = rl = GETRLEAF(h, nxt);
752 nbytes = NRLEAF(rl);
753 break;
754 default:
755 abort();
756 }
757 ++nxt;
758 r->linp[off] = r->upper -= nbytes;
759 memmove((char *)r + r->upper, src, nbytes);
760 }
761 r->lower += off * sizeof(indx_t);
762
763 /* If the key is being appended to the page, adjust the index. */
764 if (skip == top)
765 r->lower += sizeof(indx_t);
766
767 return (rval);
768 }
769
770 /**
771 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
772 *
773 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
774 * record that references them gets deleted. Chains pointed to by internal
775 * pages never get deleted. This routine marks a chain as pointed to by an
776 * internal page.
777 *
778 * @param t tree
779 * @param pg page number of first page in the chain.
780 *
781 * @return
782 * #RET_SUCCESS, #RET_ERROR.
783 */
784 static int
785 bt_preserve(t, pg)
786 BTREE *t;
787 pgno_t pg;
788 {
789 PAGE *h;
790
791 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
792 return (RET_ERROR);
793 h->flags |= P_PRESERVE;
794 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
795 return (RET_SUCCESS);
796 }
797
798 /**
799 * REC_TOTAL -- Return the number of recno entries below a page.
800 *
801 * @param h page
802 *
803 * @return
804 * The number of recno entries below a page.
805 *
806 * @par XXX
807 * These values could be set by the #bt_psplit routine. The problem is that the
808 * entry has to be popped off of the stack etc. or the values have to be passed
809 * all the way back to #bt_split/#bt_rroot and it's not very clean.
810 */
811 static recno_t
812 rec_total(h)
813 PAGE *h;
814 {
815 recno_t recs;
816 indx_t nxt, top;
817
818 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
819 recs += GETRINTERNAL(h, nxt)->nrecs;
820 return (recs);
821 }
/* */