root/libdb/bt_split.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. __bt_split
  2. bt_page
  3. bt_root
  4. bt_rroot
  5. bt_broot
  6. bt_psplit
  7. bt_preserve
  8. rec_total

   1 /*-
   2  * Copyright (c) 1990, 1993, 1994
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * Mike Olson.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  */
  32 
  33 #if defined(LIBC_SCCS) && !defined(lint)
  34 static char sccsid[] = "@(#)bt_split.c  8.9 (Berkeley) 7/26/94";
  35 #endif /* LIBC_SCCS and not lint */
  36 
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_LIMITS_H
  42 #include <limits.h>
  43 #endif
  44 #include <stdio.h>
  45 #ifdef STDC_HEADERS
  46 #include <stdlib.h>
  47 #endif
  48 #ifdef HAVE_STRING_H
  49 #include <string.h>
  50 #else
  51 #include <strings.h>
  52 #endif
  53 
  54 #include "db.h"
  55 #include "btree.h"
  56 
  57 static int       bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
  58 static PAGE     *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
  59 static int       bt_preserve(BTREE *, pgno_t);
  60 static PAGE     *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
  61 static PAGE     *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
  62 static int       bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
  63 static recno_t   rec_total(PAGE *);
  64 
  65 u_long  bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
  66 
  67 /**
  68  * __BT_SPLIT -- Split the tree.
  69  *
  70  *      @param t        tree
  71  *      @param sp       page to split
  72  *      @param key      key to insert
  73  *      @param data     data to insert
  74  *      @param flags    #P_BIGKEY/#P_BIGDATA flags
  75  *      @param ilen     insert length
  76  *      @param argskip  index to leave open
  77  *
  78  * @return
  79  *      #RET_ERROR, #RET_SUCCESS
  80  */
  81 int
  82 __bt_split(t, sp, key, data, flags, ilen, argskip)
  83         BTREE *t;
  84         PAGE *sp;
  85         const DBT *key, *data;
  86         int flags;
  87         size_t ilen;
  88         u_int32_t argskip;
  89 {
  90         BINTERNAL *bi = NULL;
  91         BLEAF *bl = NULL, *tbl;
  92         DBT a, b;
  93         EPGNO *parent;
  94         PAGE *h, *l, *r, *lchild, *rchild;
  95         indx_t nxtindex;
  96         u_int16_t skip;
  97         u_int32_t n, nbytes, nksize = 0;
  98         int parentsplit;
  99         char *dest;
 100 
 101         /*
 102          * Split the page into two pages, l and r.  The split routines return
 103          * a pointer to the page into which the key should be inserted and with
 104          * skip set to the offset which should be used.  Additionally, l and r
 105          * are pinned.
 106          */
 107         skip = argskip;
 108         h = sp->pgno == P_ROOT ?
 109             bt_root(t, sp, &l, &r, &skip, ilen) :
 110             bt_page(t, sp, &l, &r, &skip, ilen);
 111         if (h == NULL)
 112                 return (RET_ERROR);
 113 
 114         /*
 115          * Insert the new key/data pair into the leaf page.  (Key inserts
 116          * always cause a leaf page to split first.)
 117          */
 118         h->linp[skip] = h->upper -= ilen;
 119         dest = (char *)h + h->upper;
 120         if (F_ISSET(t, R_RECNO))
 121                 WR_RLEAF(dest, data, flags)
 122         else
 123                 WR_BLEAF(dest, key, data, flags)
 124 
 125         /* If the root page was split, make it look right. */
 126         if (sp->pgno == P_ROOT &&
 127             (F_ISSET(t, R_RECNO) ?
 128             bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
 129                 goto err2;
 130 
 131         /*
 132          * Now we walk the parent page stack -- a LIFO stack of the pages that
 133          * were traversed when we searched for the page that split.  Each stack
 134          * entry is a page number and a page index offset.  The offset is for
 135          * the page traversed on the search.  We've just split a page, so we
 136          * have to insert a new key into the parent page.
 137          *
 138          * If the insert into the parent page causes it to split, may have to
 139          * continue splitting all the way up the tree.  We stop if the root
 140          * splits or the page inserted into didn't have to split to hold the
 141          * new key.  Some algorithms replace the key for the old page as well
 142          * as the new page.  We don't, as there's no reason to believe that the
 143          * first key on the old page is any better than the key we have, and,
 144          * in the case of a key being placed at index 0 causing the split, the
 145          * key is unavailable.
 146          *
 147          * There are a maximum of 5 pages pinned at any time.  We keep the left
 148          * and right pages pinned while working on the parent.   The 5 are the
 149          * two children, left parent and right parent (when the parent splits)
 150          * and the root page or the overflow key page when calling bt_preserve.
 151          * This code must make sure that all pins are released other than the
 152          * root page or overflow page which is unlocked elsewhere.
 153          */
 154         while ((parent = BT_POP(t)) != NULL) {
 155                 lchild = l;
 156                 rchild = r;
 157 
 158                 /* Get the parent page. */
 159                 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
 160                         goto err2;
 161 
 162                 /*
 163                  * The new key goes ONE AFTER the index, because the split
 164                  * was to the right.
 165                  */
 166                 skip = parent->index + 1;
 167 
 168                 /*
 169                  * Calculate the space needed on the parent page.
 170                  *
 171                  * Prefix trees: space hack when inserting into BINTERNAL
 172                  * pages.  Retain only what's needed to distinguish between
 173                  * the new entry and the LAST entry on the page to its left.
 174                  * If the keys compare equal, retain the entire key.  Note,
 175                  * we don't touch overflow keys, and the entire key must be
 176                  * retained for the next-to-left most key on the leftmost
 177                  * page of each level, or the search will fail.  Applicable
 178                  * ONLY to internal pages that have leaf pages as children.
 179                  * Further reduction of the key between pairs of internal
 180                  * pages loses too much information.
 181                  */
 182                 switch (rchild->flags & P_TYPE) {
 183                 case P_BINTERNAL:
 184                         bi = GETBINTERNAL(rchild, 0);
 185                         nbytes = NBINTERNAL(bi->ksize);
 186                         break;
 187                 case P_BLEAF:
 188                         bl = GETBLEAF(rchild, 0);
 189                         nbytes = NBINTERNAL(bl->ksize);
 190                         if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
 191                             (h->prevpg != P_INVALID || skip > 1)) {
 192                                 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
 193                                 a.size = tbl->ksize;
 194                                 a.data = tbl->bytes;
 195                                 b.size = bl->ksize;
 196                                 b.data = bl->bytes;
 197                                 nksize = t->bt_pfx(&a, &b);
 198                                 n = NBINTERNAL(nksize);
 199                                 if (n < nbytes) {
 200 #ifdef STATISTICS
 201                                         bt_pfxsaved += nbytes - n;
 202 #endif
 203                                         nbytes = n;
 204                                 } else
 205                                         nksize = 0;
 206                         } else
 207                                 nksize = 0;
 208                         break;
 209                 case P_RINTERNAL:
 210                 case P_RLEAF:
 211                         nbytes = NRINTERNAL;
 212                         break;
 213                 default:
 214                         abort();
 215                 }
 216 
 217                 /* Split the parent page if necessary or shift the indices. */
 218                 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
 219                         sp = h;
 220                         h = h->pgno == P_ROOT ?
 221                             bt_root(t, h, &l, &r, &skip, nbytes) :
 222                             bt_page(t, h, &l, &r, &skip, nbytes);
 223                         if (h == NULL)
 224                                 goto err1;
 225                         parentsplit = 1;
 226                 } else {
 227                         if (skip < (nxtindex = NEXTINDEX(h)))
 228                                 memmove(h->linp + skip + 1, h->linp + skip,
 229                                     (nxtindex - skip) * sizeof(indx_t));
 230                         h->lower += sizeof(indx_t);
 231                         parentsplit = 0;
 232                 }
 233 
 234                 /* Insert the key into the parent page. */
 235                 switch (rchild->flags & P_TYPE) {
 236                 case P_BINTERNAL:
 237                         h->linp[skip] = h->upper -= nbytes;
 238                         dest = (char *)h + h->linp[skip];
 239                         memmove(dest, bi, nbytes);
 240                         ((BINTERNAL *)dest)->pgno = rchild->pgno;
 241                         break;
 242                 case P_BLEAF:
 243                         h->linp[skip] = h->upper -= nbytes;
 244                         dest = (char *)h + h->linp[skip];
 245                         WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
 246                             rchild->pgno, bl->flags & P_BIGKEY);
 247                         memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
 248                         if (bl->flags & P_BIGKEY &&
 249                             bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
 250                                 goto err1;
 251                         break;
 252                 case P_RINTERNAL:
 253                         /*
 254                          * Update the left page count.  If split
 255                          * added at index 0, fix the correct page.
 256                          */
 257                         if (skip > 0)
 258                                 dest = (char *)h + h->linp[skip - 1];
 259                         else
 260                                 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
 261                         ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
 262                         ((RINTERNAL *)dest)->pgno = lchild->pgno;
 263 
 264                         /* Update the right page count. */
 265                         h->linp[skip] = h->upper -= nbytes;
 266                         dest = (char *)h + h->linp[skip];
 267                         ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
 268                         ((RINTERNAL *)dest)->pgno = rchild->pgno;
 269                         break;
 270                 case P_RLEAF:
 271                         /*
 272                          * Update the left page count.  If split
 273                          * added at index 0, fix the correct page.
 274                          */
 275                         if (skip > 0)
 276                                 dest = (char *)h + h->linp[skip - 1];
 277                         else
 278                                 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
 279                         ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
 280                         ((RINTERNAL *)dest)->pgno = lchild->pgno;
 281 
 282                         /* Update the right page count. */
 283                         h->linp[skip] = h->upper -= nbytes;
 284                         dest = (char *)h + h->linp[skip];
 285                         ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
 286                         ((RINTERNAL *)dest)->pgno = rchild->pgno;
 287                         break;
 288                 default:
 289                         abort();
 290                 }
 291 
 292                 /* Unpin the held pages. */
 293                 if (!parentsplit) {
 294                         mpool_put(t->bt_mp, h, MPOOL_DIRTY);
 295                         break;
 296                 }
 297 
 298                 /* If the root page was split, make it look right. */
 299                 if (sp->pgno == P_ROOT &&
 300                     (F_ISSET(t, R_RECNO) ?
 301                     bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
 302                         goto err1;
 303 
 304                 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
 305                 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
 306         }
 307 
 308         /* Unpin the held pages. */
 309         mpool_put(t->bt_mp, l, MPOOL_DIRTY);
 310         mpool_put(t->bt_mp, r, MPOOL_DIRTY);
 311 
 312         /* Clear any pages left on the stack. */
 313         return (RET_SUCCESS);
 314 
 315         /*
 316          * If something fails in the above loop we were already walking back
 317          * up the tree and the tree is now inconsistent.  Nothing much we can
 318          * do about it but release any memory we're holding.
 319          */
 320 err1:   mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
 321         mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
 322 
 323 err2:   mpool_put(t->bt_mp, l, 0);
 324         mpool_put(t->bt_mp, r, 0);
 325         __dbpanic(t->bt_dbp);
 326         return (RET_ERROR);
 327 }
 328 
 329 /**
 330  * BT_PAGE -- Split a non-root page of a btree.
 331  *
 332  *      @param t        tree
 333  *      @param h        root page
 334  *      @param lp       pointer to left page pointer
 335  *      @param rp       pointer to right page pointer
 336  *      @param skip     pointer to index to leave open
 337  *      @param ilen     insert length
 338  *
 339  * @return
 340  *      Pointer to page in which to insert or @CODE{NULL} on error.
 341  */
 342 static PAGE *
 343 bt_page(t, h, lp, rp, skip, ilen)
 344         BTREE *t;
 345         PAGE *h, **lp, **rp;
 346         indx_t *skip;
 347         size_t ilen;
 348 {
 349         PAGE *l, *r, *tp;
 350         pgno_t npg;
 351 
 352 #ifdef STATISTICS
 353         ++bt_split;
 354 #endif
 355         /* Put the new right page for the split into place. */
 356         if ((r = __bt_new(t, &npg)) == NULL)
 357                 return (NULL);
 358         r->pgno = npg;
 359         r->lower = BTDATAOFF;
 360         r->upper = t->bt_psize;
 361         r->nextpg = h->nextpg;
 362         r->prevpg = h->pgno;
 363         r->flags = h->flags & P_TYPE;
 364 
 365         /*
 366          * If we're splitting the last page on a level because we're appending
 367          * a key to it (skip is NEXTINDEX()), it's likely that the data is
 368          * sorted.  Adding an empty page on the side of the level is less work
 369          * and can push the fill factor much higher than normal.  If we're
 370          * wrong it's no big deal, we'll just do the split the right way next
 371          * time.  It may look like it's equally easy to do a similar hack for
 372          * reverse sorted data, that is, split the tree left, but it's not.
 373          * Don't even try.
 374          */
 375         if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
 376 #ifdef STATISTICS
 377                 ++bt_sortsplit;
 378 #endif
 379                 h->nextpg = r->pgno;
 380                 r->lower = BTDATAOFF + sizeof(indx_t);
 381                 *skip = 0;
 382                 *lp = h;
 383                 *rp = r;
 384                 return (r);
 385         }
 386 
 387         /* Put the new left page for the split into place. */
 388         if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
 389                 mpool_put(t->bt_mp, r, 0);
 390                 return (NULL);
 391         }
 392 #ifdef PURIFY
 393         memset(l, 0xff, t->bt_psize);
 394 #endif
 395         l->pgno = h->pgno;
 396         l->nextpg = r->pgno;
 397         l->prevpg = h->prevpg;
 398         l->lower = BTDATAOFF;
 399         l->upper = t->bt_psize;
 400         l->flags = h->flags & P_TYPE;
 401 
 402         /* Fix up the previous pointer of the page after the split page. */
 403         if (h->nextpg != P_INVALID) {
 404                 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
 405                         free(l);
 406                         /* XXX mpool_free(t->bt_mp, r->pgno); */
 407                         return (NULL);
 408                 }
 409                 tp->prevpg = r->pgno;
 410                 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
 411         }
 412 
 413         /*
 414          * Split right.  The key/data pairs aren't sorted in the btree page so
 415          * it's simpler to copy the data from the split page onto two new pages
 416          * instead of copying half the data to the right page and compacting
 417          * the left page in place.  Since the left page can't change, we have
 418          * to swap the original and the allocated left page after the split.
 419          */
 420         tp = bt_psplit(t, h, l, r, skip, ilen);
 421 
 422         /* Move the new left page onto the old left page. */
 423         memmove(h, l, t->bt_psize);
 424         if (tp == l)
 425                 tp = h;
 426         free(l);
 427 
 428         *lp = h;
 429         *rp = r;
 430         return (tp);
 431 }
 432 
 433 /**
 434  * BT_ROOT -- Split the root page of a btree.
 435  *
 436  *      @param t        tree
 437  *      @param h        root page
 438  *      @param lp       pointer to left page pointer
 439  *      @param rp       pointer to right page pointer
 440  *      @param skip     pointer to index to leave open
 441  *      @param ilen     insert length
 442  *
 443  * @return
 444  *      Pointer to page in which to insert or @CODE{NULL} on error.
 445  */
 446 static PAGE *
 447 bt_root(t, h, lp, rp, skip, ilen)
 448         BTREE *t;
 449         PAGE *h, **lp, **rp;
 450         indx_t *skip;
 451         size_t ilen;
 452 {
 453         PAGE *l, *r, *tp;
 454         pgno_t lnpg, rnpg;
 455 
 456 #ifdef STATISTICS
 457         ++bt_split;
 458         ++bt_rootsplit;
 459 #endif
 460         /* Put the new left and right pages for the split into place. */
 461         if ((l = __bt_new(t, &lnpg)) == NULL ||
 462             (r = __bt_new(t, &rnpg)) == NULL)
 463                 return (NULL);
 464         l->pgno = lnpg;
 465         r->pgno = rnpg;
 466         l->nextpg = r->pgno;
 467         r->prevpg = l->pgno;
 468         l->prevpg = r->nextpg = P_INVALID;
 469         l->lower = r->lower = BTDATAOFF;
 470         l->upper = r->upper = t->bt_psize;
 471         l->flags = r->flags = h->flags & P_TYPE;
 472 
 473         /* Split the root page. */
 474         tp = bt_psplit(t, h, l, r, skip, ilen);
 475 
 476         *lp = l;
 477         *rp = r;
 478         return (tp);
 479 }
 480 
 481 /**
 482  * BT_RROOT -- Fix up the recno root page after it has been split.
 483  *
 484  *      @param t        tree
 485  *      @param h        root page
 486  *      @param l        left page
 487  *      @param r        right page
 488  *
 489  * @return
 490  *      #RET_ERROR, #RET_SUCCESS
 491  */
 492 static int
 493 bt_rroot(t, h, l, r)
 494         BTREE *t;
 495         PAGE *h, *l, *r;
 496 {
 497         char *dest;
 498 
 499         /* Insert the left and right keys, set the header information. */
 500         h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
 501         dest = (char *)h + h->upper;
 502         WR_RINTERNAL(dest,
 503             l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
 504 
 505         h->linp[1] = h->upper -= NRINTERNAL;
 506         dest = (char *)h + h->upper;
 507         WR_RINTERNAL(dest,
 508             r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
 509 
 510         h->lower = BTDATAOFF + 2 * sizeof(indx_t);
 511 
 512         /* Unpin the root page, set to recno internal page. */
 513         h->flags &= ~P_TYPE;
 514         h->flags |= P_RINTERNAL;
 515         mpool_put(t->bt_mp, h, MPOOL_DIRTY);
 516 
 517         return (RET_SUCCESS);
 518 }
 519 
 520 /**
 521  * BT_BROOT -- Fix up the btree root page after it has been split.
 522  *
 523  *      @param t        tree
 524  *      @param h        root page
 525  *      @param l        left page
 526  *      @param r        right page
 527  *
 528  * @return
 529  *      #RET_ERROR, #RET_SUCCESS
 530  */
 531 static int
 532 bt_broot(t, h, l, r)
 533         BTREE *t;
 534         PAGE *h, *l, *r;
 535 {
 536         BINTERNAL *bi;
 537         BLEAF *bl;
 538         u_int32_t nbytes;
 539         char *dest;
 540 
 541         /*
 542          * If the root page was a leaf page, change it into an internal page.
 543          * We copy the key we split on (but not the key's data, in the case of
 544          * a leaf page) to the new root page.
 545          *
 546          * The btree comparison code guarantees that the left-most key on any
 547          * level of the tree is never used, so it doesn't need to be filled in.
 548          */
 549         nbytes = NBINTERNAL(0);
 550         h->linp[0] = h->upper = t->bt_psize - nbytes;
 551         dest = (char *)h + h->upper;
 552         WR_BINTERNAL(dest, 0, l->pgno, 0);
 553 
 554         switch (h->flags & P_TYPE) {
 555         case P_BLEAF:
 556                 bl = GETBLEAF(r, 0);
 557                 nbytes = NBINTERNAL(bl->ksize);
 558                 h->linp[1] = h->upper -= nbytes;
 559                 dest = (char *)h + h->upper;
 560                 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
 561                 memmove(dest, bl->bytes, bl->ksize);
 562 
 563                 /*
 564                  * If the key is on an overflow page, mark the overflow chain
 565                  * so it isn't deleted when the leaf copy of the key is deleted.
 566                  */
 567                 if (bl->flags & P_BIGKEY &&
 568                     bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
 569                         return (RET_ERROR);
 570                 break;
 571         case P_BINTERNAL:
 572                 bi = GETBINTERNAL(r, 0);
 573                 nbytes = NBINTERNAL(bi->ksize);
 574                 h->linp[1] = h->upper -= nbytes;
 575                 dest = (char *)h + h->upper;
 576                 memmove(dest, bi, nbytes);
 577                 ((BINTERNAL *)dest)->pgno = r->pgno;
 578                 break;
 579         default:
 580                 abort();
 581         }
 582 
 583         /* There are two keys on the page. */
 584         h->lower = BTDATAOFF + 2 * sizeof(indx_t);
 585 
 586         /* Unpin the root page, set to btree internal page. */
 587         h->flags &= ~P_TYPE;
 588         h->flags |= P_BINTERNAL;
 589         mpool_put(t->bt_mp, h, MPOOL_DIRTY);
 590 
 591         return (RET_SUCCESS);
 592 }
 593 
 594 /**
 595  * BT_PSPLIT -- Do the real work of splitting the page.
 596  *
 597  *      @param t        tree
 598  *      @param h        page to be split
 599  *      @param l        page to put lower half of data
 600  *      @param r        page to put upper half of data
 601  *      @param pskip    pointer to index to leave open
 602  *      @param ilen     insert length
 603  *
 604  * @return
 605  *      Pointer to page in which to insert.
 606  */
 607 static PAGE *
 608 bt_psplit(t, h, l, r, pskip, ilen)
 609         BTREE *t;
 610         PAGE *h, *l, *r;
 611         indx_t *pskip;
 612         size_t ilen;
 613 {
 614         BINTERNAL *bi;
 615         BLEAF *bl;
 616         CURSOR *c;
 617         RLEAF *rl;
 618         PAGE *rval;
 619         void *src = NULL;
 620         indx_t full, half, nxt, off, skip, top, used;
 621         u_int32_t nbytes;
 622         int bigkeycnt, isbigkey;
 623 
 624         /*
 625          * Split the data to the left and right pages.  Leave the skip index
 626          * open.  Additionally, make some effort not to split on an overflow
 627          * key.  This makes internal page processing faster and can save
 628          * space as overflow keys used by internal pages are never deleted.
 629          */
 630         bigkeycnt = 0;
 631         skip = *pskip;
 632         full = t->bt_psize - BTDATAOFF;
 633         half = full / 2;
 634         used = 0;
 635         for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
 636                 if (skip == off) {
 637                         nbytes = ilen;
 638                         isbigkey = 0;           /* XXX: not really known. */
 639                 } else
 640                         switch (h->flags & P_TYPE) {
 641                         case P_BINTERNAL:
 642                                 src = bi = GETBINTERNAL(h, nxt);
 643                                 nbytes = NBINTERNAL(bi->ksize);
 644                                 isbigkey = bi->flags & P_BIGKEY;
 645                                 break;
 646                         case P_BLEAF:
 647                                 src = bl = GETBLEAF(h, nxt);
 648                                 nbytes = NBLEAF(bl);
 649                                 isbigkey = bl->flags & P_BIGKEY;
 650                                 break;
 651                         case P_RINTERNAL:
 652                                 src = GETRINTERNAL(h, nxt);
 653                                 nbytes = NRINTERNAL;
 654                                 isbigkey = 0;
 655                                 break;
 656                         case P_RLEAF:
 657                                 src = rl = GETRLEAF(h, nxt);
 658                                 nbytes = NRLEAF(rl);
 659                                 isbigkey = 0;
 660                                 break;
 661                         default:
 662                                 abort();
 663                         }
 664 
 665                 /*
 666                  * If the key/data pairs are substantial fractions of the max
 667                  * possible size for the page, it's possible to get situations
 668                  * where we decide to try and copy too much onto the left page.
 669                  * Make sure that doesn't happen.
 670                  */
 671                 if ((skip <= off &&
 672                     used + nbytes + sizeof(indx_t) >= full) || nxt == top - 1) {
 673                         --off;
 674                         break;
 675                 }
 676 
 677                 /* Copy the key/data pair, if not the skipped index. */
 678                 if (skip != off) {
 679                         ++nxt;
 680 
 681                         l->linp[off] = l->upper -= nbytes;
 682                         memmove((char *)l + l->upper, src, nbytes);
 683                 }
 684 
 685                 used += nbytes + sizeof(indx_t);
 686                 if (used >= half) {
 687                         if (!isbigkey || bigkeycnt == 3)
 688                                 break;
 689                         else
 690                                 ++bigkeycnt;
 691                 }
 692         }
 693 
 694         /*
 695          * Off is the last offset that's valid for the left page.
 696          * Nxt is the first offset to be placed on the right page.
 697          */
 698         l->lower += (off + 1) * sizeof(indx_t);
 699 
 700         /*
 701          * If splitting the page that the cursor was on, the cursor has to be
 702          * adjusted to point to the same record as before the split.  If the
 703          * cursor is at or past the skipped slot, the cursor is incremented by
 704          * one.  If the cursor is on the right page, it is decremented by the
 705          * number of records split to the left page.
 706          */
 707         c = &t->bt_cursor;
 708         if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
 709                 if (c->pg.index >= skip)
 710                         ++c->pg.index;
 711                 if (c->pg.index < nxt)                  /* Left page. */
 712                         c->pg.pgno = l->pgno;
 713                 else {                                  /* Right page. */
 714                         c->pg.pgno = r->pgno;
 715                         c->pg.index -= nxt;
 716                 }
 717         }
 718 
 719         /*
 720          * If the skipped index was on the left page, just return that page.
 721          * Otherwise, adjust the skip index to reflect the new position on
 722          * the right page.
 723          */
 724         if (skip <= off) {
 725                 skip = 0;
 726                 rval = l;
 727         } else {
 728                 rval = r;
 729                 *pskip -= nxt;
 730         }
 731 
 732         for (off = 0; nxt < top; ++off) {
 733                 if (skip == nxt) {
 734                         ++off;
 735                         skip = 0;
 736                 }
 737                 switch (h->flags & P_TYPE) {
 738                 case P_BINTERNAL:
 739                         src = bi = GETBINTERNAL(h, nxt);
 740                         nbytes = NBINTERNAL(bi->ksize);
 741                         break;
 742                 case P_BLEAF:
 743                         src = bl = GETBLEAF(h, nxt);
 744                         nbytes = NBLEAF(bl);
 745                         break;
 746                 case P_RINTERNAL:
 747                         src = GETRINTERNAL(h, nxt);
 748                         nbytes = NRINTERNAL;
 749                         break;
 750                 case P_RLEAF:
 751                         src = rl = GETRLEAF(h, nxt);
 752                         nbytes = NRLEAF(rl);
 753                         break;
 754                 default:
 755                         abort();
 756                 }
 757                 ++nxt;
 758                 r->linp[off] = r->upper -= nbytes;
 759                 memmove((char *)r + r->upper, src, nbytes);
 760         }
 761         r->lower += off * sizeof(indx_t);
 762 
 763         /* If the key is being appended to the page, adjust the index. */
 764         if (skip == top)
 765                 r->lower += sizeof(indx_t);
 766 
 767         return (rval);
 768 }
 769 
 770 /**
 771  * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
 772  *
 773  * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
 774  * record that references them gets deleted.  Chains pointed to by internal
 775  * pages never get deleted.  This routine marks a chain as pointed to by an
 776  * internal page.
 777  *
 778  *      @param t        tree
 779  *      @param pg       page number of first page in the chain.
 780  *
 781  * @return
 782  *      #RET_SUCCESS, #RET_ERROR.
 783  */
 784 static int
 785 bt_preserve(t, pg)
 786         BTREE *t;
 787         pgno_t pg;
 788 {
 789         PAGE *h;
 790 
 791         if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
 792                 return (RET_ERROR);
 793         h->flags |= P_PRESERVE;
 794         mpool_put(t->bt_mp, h, MPOOL_DIRTY);
 795         return (RET_SUCCESS);
 796 }
 797 
 798 /**
 799  * REC_TOTAL -- Return the number of recno entries below a page.
 800  *
 801  *      @param h        page
 802  *
 803  * @return
 804  *      The number of recno entries below a page.
 805  *
 806  * @par XXX
 807  * These values could be set by the #bt_psplit routine.  The problem is that the
 808  * entry has to be popped off of the stack etc. or the values have to be passed
 809  * all the way back to #bt_split/#bt_rroot and it's not very clean.
 810  */
 811 static recno_t
 812 rec_total(h)
 813         PAGE *h;
 814 {
 815         recno_t recs;
 816         indx_t nxt, top;
 817 
 818         for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
 819                 recs += GETRINTERNAL(h, nxt)->nrecs;
 820         return (recs);
 821 }

/* [previous][next][first][last][top][bottom][index][help] */