• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
13 ** See the header comment on "btreeInt.h" for additional information.
14 ** Including a description of file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
24 /*
25 ** Set this global variable to 1 to enable tracing using the TRACE
26 ** macro.
27 */
28 #if 0
29 int sqlite3BtreeTrace=1;  /* True to enable tracing */
30 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
31 #else
32 # define TRACE(X)
33 #endif
34 
35 /*
36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
37 ** But if the value is zero, make it 65536.
38 **
39 ** This routine is used to extract the "offset to cell content area" value
40 ** from the header of a btree page.  If the page size is 65536 and the page
41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.
42 ** This routine makes the necessary adjustment to 65536.
43 */
44 #define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
45 
46 #ifndef SQLITE_OMIT_SHARED_CACHE
47 /*
48 ** A list of BtShared objects that are eligible for participation
49 ** in shared cache.  This variable has file scope during normal builds,
50 ** but the test harness needs to access it so we make it global for
51 ** test builds.
52 **
53 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
54 */
55 #ifdef SQLITE_TEST
56 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
57 #else
58 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
59 #endif
60 #endif /* SQLITE_OMIT_SHARED_CACHE */
61 
62 #ifndef SQLITE_OMIT_SHARED_CACHE
63 /*
64 ** Enable or disable the shared pager and schema features.
65 **
66 ** This routine has no effect on existing database connections.
67 ** The shared cache setting effects only future calls to
68 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
69 */
sqlite3_enable_shared_cache(int enable)70 int sqlite3_enable_shared_cache(int enable){
71   sqlite3GlobalConfig.sharedCacheEnabled = enable;
72   return SQLITE_OK;
73 }
74 #endif
75 
76 
77 
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** querySharedCacheTableLock(), setSharedCacheTableLock() and
  ** clearAllSharedCacheTableLocks() maintain the BtShared.pLock linked
  ** list that holds shared-cache table level locks.  When the library
  ** is built without the shared-cache feature every BtShared structure
  ** has exactly one user, so no such locking is needed.  The lock
  ** related routines are therefore replaced by no-op macros: the
  ** query/set macros evaluate to SQLITE_OK, hasSharedCacheTableLock()
  ** evaluates to 1, and hasReadConflicts() evaluates to 0.
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
#endif
95 
96 #ifndef SQLITE_OMIT_SHARED_CACHE
97 
#ifdef SQLITE_DEBUG
/*
**** For use inside assert() statements only. ***
**
** Report whether pBtree holds the locks needed to read or write the
** table whose root page is iRoot.  Returns 1 if it does, 0 if not.
**
** Typical use when writing to the table rooted at iRoot through
** Btree connection pBtree:
**
**    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
**
** A writer of an index in a sharable database is expected to hold a
** lock on the root page of the table that owns the index, not on the
** index itself.  Because this module treats every b-tree as a separate
** structure, the owning table has to be looked up by scanning the
** database schema.
**
** A write-lock on the schema table (root page 1) is accepted in place
** of a lock on the table/index rooted at iRoot.
*/
static int hasSharedCacheTableLock(
  Btree *pBtree,         /* Handle that must hold lock */
  Pgno iRoot,            /* Root page of b-tree */
  int isIndex,           /* True if iRoot is the root of an index b-tree */
  int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
){
  Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
  Pgno iTab = 0;         /* Root page on which the lock must be held */
  BtLock *pLock;

  /* No lock is needed on a database that is not sharable. */
  if( pBtree->sharable==0 ) return 1;

  /* Nor is one needed by a reader with the read-uncommitted flag set. */
  if( eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted) ){
    return 1;
  }

  if( isIndex ){
    HashElem *pElem;

    /* Without a loaded schema the owning table cannot be found.  This
    ** is an uncommon case, so skip the check and report success. */
    if( pSchema==0 || (pSchema->flags & DB_SchemaLoaded)==0 ) return 1;

    /* For an index the lock lives on the root page of its table. */
    pElem = sqliteHashFirst(&pSchema->idxHash);
    while( pElem ){
      Index *pIdx = (Index *)sqliteHashData(pElem);
      if( pIdx->tnum==(int)iRoot ) iTab = pIdx->pTable->tnum;
      pElem = sqliteHashNext(pElem);
    }
  }else{
    /* For a table the lock lives on its own root page. */
    iTab = iRoot;
  }

  /* Scan the lock list for an entry owned by pBtree that is at least as
  ** strong as eLockType and that is either on iTab or is a write-lock
  ** on the schema table. */
  for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
    if( pLock->pBtree!=pBtree ) continue;
    if( pLock->eLock<eLockType ) continue;
    if( pLock->iTable==iTab
     || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)
    ){
      return 1;
    }
  }

  /* The required lock is not held. */
  return 0;
}
#endif /* SQLITE_DEBUG */
182 
#ifdef SQLITE_DEBUG
/*
**** For use inside assert() statements only. ****
**
** Return true if pBtree must not write to the table or index rooted at
** iRoot because another connection sharing the same BtShared has a
** cursor open on it.
**
** A cursor belonging to some other Btree on the same root page counts
** as a conflict unless that other Btree's connection has the
** read-uncommitted flag set.
**
** Intended use, before modifying anything in the b-tree rooted at
** page iRoot:
**
**    assert( !hasReadConflicts(pBtree, iRoot) );
*/
static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
  BtCursor *pCsr = pBtree->pBt->pCursor;
  while( pCsr ){
    if( pCsr->pgnoRoot==iRoot && pCsr->pBtree!=pBtree ){
      if( (pCsr->pBtree->db->flags & SQLITE_ReadUncommitted)==0 ){
        return 1;
      }
    }
    pCsr = pCsr->pNext;
  }
  return 0;
}
#endif    /* #ifdef SQLITE_DEBUG */
215 
216 /*
217 ** Query to see if Btree handle p may obtain a lock of type eLock
218 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
219 ** SQLITE_OK if the lock may be obtained (by calling
220 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
221 */
querySharedCacheTableLock(Btree * p,Pgno iTab,u8 eLock)222 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
223   BtShared *pBt = p->pBt;
224   BtLock *pIter;
225 
226   assert( sqlite3BtreeHoldsMutex(p) );
227   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
228   assert( p->db!=0 );
229   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
230 
231   /* If requesting a write-lock, then the Btree must have an open write
232   ** transaction on this file. And, obviously, for this to be so there
233   ** must be an open write transaction on the file itself.
234   */
235   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
236   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
237 
238   /* This routine is a no-op if the shared-cache is not enabled */
239   if( !p->sharable ){
240     return SQLITE_OK;
241   }
242 
243   /* If some other connection is holding an exclusive lock, the
244   ** requested lock may not be obtained.
245   */
246   if( pBt->pWriter!=p && pBt->isExclusive ){
247     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
248     return SQLITE_LOCKED_SHAREDCACHE;
249   }
250 
251   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
252     /* The condition (pIter->eLock!=eLock) in the following if(...)
253     ** statement is a simplification of:
254     **
255     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
256     **
257     ** since we know that if eLock==WRITE_LOCK, then no other connection
258     ** may hold a WRITE_LOCK on any table in this file (since there can
259     ** only be a single writer).
260     */
261     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
262     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
263     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
264       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
265       if( eLock==WRITE_LOCK ){
266         assert( p==pBt->pWriter );
267         pBt->isPending = 1;
268       }
269       return SQLITE_LOCKED_SHAREDCACHE;
270     }
271   }
272   return SQLITE_OK;
273 }
274 #endif /* !SQLITE_OMIT_SHARED_CACHE */
275 
276 #ifndef SQLITE_OMIT_SHARED_CACHE
277 /*
278 ** Add a lock on the table with root-page iTable to the shared-btree used
279 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
280 ** WRITE_LOCK.
281 **
282 ** This function assumes the following:
283 **
284 **   (a) The specified Btree object p is connected to a sharable
285 **       database (one with the BtShared.sharable flag set), and
286 **
287 **   (b) No other Btree objects hold a lock that conflicts
288 **       with the requested lock (i.e. querySharedCacheTableLock() has
289 **       already been called and returned SQLITE_OK).
290 **
291 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
292 ** is returned if a malloc attempt fails.
293 */
setSharedCacheTableLock(Btree * p,Pgno iTable,u8 eLock)294 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
295   BtShared *pBt = p->pBt;
296   BtLock *pLock = 0;
297   BtLock *pIter;
298 
299   assert( sqlite3BtreeHoldsMutex(p) );
300   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
301   assert( p->db!=0 );
302 
303   /* A connection with the read-uncommitted flag set will never try to
304   ** obtain a read-lock using this function. The only read-lock obtained
305   ** by a connection in read-uncommitted mode is on the sqlite_master
306   ** table, and that lock is obtained in BtreeBeginTrans().  */
307   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
308 
309   /* This function should only be called on a sharable b-tree after it
310   ** has been determined that no other b-tree holds a conflicting lock.  */
311   assert( p->sharable );
312   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
313 
314   /* First search the list for an existing lock on this table. */
315   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
316     if( pIter->iTable==iTable && pIter->pBtree==p ){
317       pLock = pIter;
318       break;
319     }
320   }
321 
322   /* If the above search did not find a BtLock struct associating Btree p
323   ** with table iTable, allocate one and link it into the list.
324   */
325   if( !pLock ){
326     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
327     if( !pLock ){
328       return SQLITE_NOMEM;
329     }
330     pLock->iTable = iTable;
331     pLock->pBtree = p;
332     pLock->pNext = pBt->pLock;
333     pBt->pLock = pLock;
334   }
335 
336   /* Set the BtLock.eLock variable to the maximum of the current lock
337   ** and the requested lock. This means if a write-lock was already held
338   ** and a read-lock requested, we don't incorrectly downgrade the lock.
339   */
340   assert( WRITE_LOCK>READ_LOCK );
341   if( eLock>pLock->eLock ){
342     pLock->eLock = eLock;
343   }
344 
345   return SQLITE_OK;
346 }
347 #endif /* !SQLITE_OMIT_SHARED_CACHE */
348 
349 #ifndef SQLITE_OMIT_SHARED_CACHE
350 /*
351 ** Release all the table locks (locks obtained via calls to
352 ** the setSharedCacheTableLock() procedure) held by Btree object p.
353 **
354 ** This function assumes that Btree p has an open read or write
355 ** transaction. If it does not, then the BtShared.isPending variable
356 ** may be incorrectly cleared.
357 */
clearAllSharedCacheTableLocks(Btree * p)358 static void clearAllSharedCacheTableLocks(Btree *p){
359   BtShared *pBt = p->pBt;
360   BtLock **ppIter = &pBt->pLock;
361 
362   assert( sqlite3BtreeHoldsMutex(p) );
363   assert( p->sharable || 0==*ppIter );
364   assert( p->inTrans>0 );
365 
366   while( *ppIter ){
367     BtLock *pLock = *ppIter;
368     assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree );
369     assert( pLock->pBtree->inTrans>=pLock->eLock );
370     if( pLock->pBtree==p ){
371       *ppIter = pLock->pNext;
372       assert( pLock->iTable!=1 || pLock==&p->lock );
373       if( pLock->iTable!=1 ){
374         sqlite3_free(pLock);
375       }
376     }else{
377       ppIter = &pLock->pNext;
378     }
379   }
380 
381   assert( pBt->isPending==0 || pBt->pWriter );
382   if( pBt->pWriter==p ){
383     pBt->pWriter = 0;
384     pBt->isExclusive = 0;
385     pBt->isPending = 0;
386   }else if( pBt->nTransaction==2 ){
387     /* This function is called when Btree p is concluding its
388     ** transaction. If there currently exists a writer, and p is not
389     ** that writer, then the number of locks held by connections other
390     ** than the writer must be about to drop to zero. In this case
391     ** set the isPending flag to 0.
392     **
393     ** If there is not currently a writer, then BtShared.isPending must
394     ** be zero already. So this next line is harmless in that case.
395     */
396     pBt->isPending = 0;
397   }
398 }
399 
400 /*
401 ** This function changes all write-locks held by Btree p into read-locks.
402 */
downgradeAllSharedCacheTableLocks(Btree * p)403 static void downgradeAllSharedCacheTableLocks(Btree *p){
404   BtShared *pBt = p->pBt;
405   if( pBt->pWriter==p ){
406     BtLock *pLock;
407     pBt->pWriter = 0;
408     pBt->isExclusive = 0;
409     pBt->isPending = 0;
410     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
411       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
412       pLock->eLock = READ_LOCK;
413     }
414   }
415 }
416 
417 #endif /* SQLITE_OMIT_SHARED_CACHE */
418 
419 static void releasePage(MemPage *pPage);  /* Forward reference */
420 
421 /*
422 ***** This routine is used inside of assert() only ****
423 **
424 ** Verify that the cursor holds the mutex on its BtShared
425 */
426 #ifdef SQLITE_DEBUG
cursorHoldsMutex(BtCursor * p)427 static int cursorHoldsMutex(BtCursor *p){
428   return sqlite3_mutex_held(p->pBt->mutex);
429 }
430 #endif
431 
432 
433 #ifndef SQLITE_OMIT_INCRBLOB
434 /*
435 ** Invalidate the overflow page-list cache for cursor pCur, if any.
436 */
invalidateOverflowCache(BtCursor * pCur)437 static void invalidateOverflowCache(BtCursor *pCur){
438   assert( cursorHoldsMutex(pCur) );
439   sqlite3_free(pCur->aOverflow);
440   pCur->aOverflow = 0;
441 }
442 
443 /*
444 ** Invalidate the overflow page-list cache for all cursors opened
445 ** on the shared btree structure pBt.
446 */
invalidateAllOverflowCache(BtShared * pBt)447 static void invalidateAllOverflowCache(BtShared *pBt){
448   BtCursor *p;
449   assert( sqlite3_mutex_held(pBt->mutex) );
450   for(p=pBt->pCursor; p; p=p->pNext){
451     invalidateOverflowCache(p);
452   }
453 }
454 
455 /*
456 ** This function is called before modifying the contents of a table
457 ** to invalidate any incrblob cursors that are open on the
458 ** row or one of the rows being modified.
459 **
460 ** If argument isClearTable is true, then the entire contents of the
461 ** table is about to be deleted. In this case invalidate all incrblob
462 ** cursors open on any row within the table with root-page pgnoRoot.
463 **
464 ** Otherwise, if argument isClearTable is false, then the row with
465 ** rowid iRow is being replaced or deleted. In this case invalidate
466 ** only those incrblob cursors open on that specific row.
467 */
invalidateIncrblobCursors(Btree * pBtree,i64 iRow,int isClearTable)468 static void invalidateIncrblobCursors(
469   Btree *pBtree,          /* The database file to check */
470   i64 iRow,               /* The rowid that might be changing */
471   int isClearTable        /* True if all rows are being deleted */
472 ){
473   BtCursor *p;
474   BtShared *pBt = pBtree->pBt;
475   assert( sqlite3BtreeHoldsMutex(pBtree) );
476   for(p=pBt->pCursor; p; p=p->pNext){
477     if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
478       p->eState = CURSOR_INVALID;
479     }
480   }
481 }
482 
483 #else
/* With INCRBLOB omitted the invalidation routines expand to nothing */
#define invalidateOverflowCache(x)
#define invalidateAllOverflowCache(x)
#define invalidateIncrblobCursors(x,y,z)
488 #endif /* SQLITE_OMIT_INCRBLOB */
489 
490 /*
491 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
492 ** when a page that previously contained data becomes a free-list leaf
493 ** page.
494 **
495 ** The BtShared.pHasContent bitvec exists to work around an obscure
496 ** bug caused by the interaction of two useful IO optimizations surrounding
497 ** free-list leaf pages:
498 **
499 **   1) When all data is deleted from a page and the page becomes
500 **      a free-list leaf page, the page is not written to the database
501 **      (as free-list leaf pages contain no meaningful data). Sometimes
502 **      such a page is not even journalled (as it will not be modified,
503 **      why bother journalling it?).
504 **
505 **   2) When a free-list leaf page is reused, its content is not read
506 **      from the database or written to the journal file (why should it
507 **      be, if it is not at all meaningful?).
508 **
509 ** By themselves, these optimizations work fine and provide a handy
510 ** performance boost to bulk delete or insert operations. However, if
511 ** a page is moved to the free-list and then reused within the same
512 ** transaction, a problem comes up. If the page is not journalled when
513 ** it is moved to the free-list and it is also not journalled when it
514 ** is extracted from the free-list and reused, then the original data
515 ** may be lost. In the event of a rollback, it may not be possible
516 ** to restore the database to its original configuration.
517 **
518 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
519 ** moved to become a free-list leaf page, the corresponding bit is
520 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
521 ** optimization 2 above is omitted if the corresponding bit is already
522 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
523 ** at the end of every transaction.
524 */
btreeSetHasContent(BtShared * pBt,Pgno pgno)525 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
526   int rc = SQLITE_OK;
527   if( !pBt->pHasContent ){
528     assert( pgno<=pBt->nPage );
529     pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
530     if( !pBt->pHasContent ){
531       rc = SQLITE_NOMEM;
532     }
533   }
534   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
535     rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
536   }
537   return rc;
538 }
539 
540 /*
541 ** Query the BtShared.pHasContent vector.
542 **
543 ** This function is called when a free-list leaf page is removed from the
544 ** free-list for reuse. It returns false if it is safe to retrieve the
545 ** page from the pager layer with the 'no-content' flag set. True otherwise.
546 */
btreeGetHasContent(BtShared * pBt,Pgno pgno)547 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
548   Bitvec *p = pBt->pHasContent;
549   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
550 }
551 
552 /*
553 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
554 ** invoked at the conclusion of each write-transaction.
555 */
btreeClearHasContent(BtShared * pBt)556 static void btreeClearHasContent(BtShared *pBt){
557   sqlite3BitvecDestroy(pBt->pHasContent);
558   pBt->pHasContent = 0;
559 }
560 
561 /*
562 ** Save the current cursor position in the variables BtCursor.nKey
563 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
564 **
565 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
566 ** prior to calling this routine.
567 */
saveCursorPosition(BtCursor * pCur)568 static int saveCursorPosition(BtCursor *pCur){
569   int rc;
570 
571   assert( CURSOR_VALID==pCur->eState );
572   assert( 0==pCur->pKey );
573   assert( cursorHoldsMutex(pCur) );
574 
575   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
576   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
577 
578   /* If this is an intKey table, then the above call to BtreeKeySize()
579   ** stores the integer key in pCur->nKey. In this case this value is
580   ** all that is required. Otherwise, if pCur is not open on an intKey
581   ** table, then malloc space for and store the pCur->nKey bytes of key
582   ** data.
583   */
584   if( 0==pCur->apPage[0]->intKey ){
585     void *pKey = sqlite3Malloc( (int)pCur->nKey );
586     if( pKey ){
587       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
588       if( rc==SQLITE_OK ){
589         pCur->pKey = pKey;
590       }else{
591         sqlite3_free(pKey);
592       }
593     }else{
594       rc = SQLITE_NOMEM;
595     }
596   }
597   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
598 
599   if( rc==SQLITE_OK ){
600     int i;
601     for(i=0; i<=pCur->iPage; i++){
602       releasePage(pCur->apPage[i]);
603       pCur->apPage[i] = 0;
604     }
605     pCur->iPage = -1;
606     pCur->eState = CURSOR_REQUIRESEEK;
607   }
608 
609   invalidateOverflowCache(pCur);
610   return rc;
611 }
612 
613 /*
614 ** Save the positions of all cursors (except pExcept) that are open on
615 ** the table  with root-page iRoot. Usually, this is called just before cursor
616 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
617 */
saveAllCursors(BtShared * pBt,Pgno iRoot,BtCursor * pExcept)618 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
619   BtCursor *p;
620   assert( sqlite3_mutex_held(pBt->mutex) );
621   assert( pExcept==0 || pExcept->pBt==pBt );
622   for(p=pBt->pCursor; p; p=p->pNext){
623     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
624         p->eState==CURSOR_VALID ){
625       int rc = saveCursorPosition(p);
626       if( SQLITE_OK!=rc ){
627         return rc;
628       }
629     }
630   }
631   return SQLITE_OK;
632 }
633 
634 /*
635 ** Clear the current cursor position.
636 */
sqlite3BtreeClearCursor(BtCursor * pCur)637 void sqlite3BtreeClearCursor(BtCursor *pCur){
638   assert( cursorHoldsMutex(pCur) );
639   sqlite3_free(pCur->pKey);
640   pCur->pKey = 0;
641   pCur->eState = CURSOR_INVALID;
642 }
643 
644 /*
645 ** In this version of BtreeMoveto, pKey is a packed index record
646 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
647 ** record and then call BtreeMovetoUnpacked() to do the work.
648 */
btreeMoveto(BtCursor * pCur,const void * pKey,i64 nKey,int bias,int * pRes)649 static int btreeMoveto(
650   BtCursor *pCur,     /* Cursor open on the btree to be searched */
651   const void *pKey,   /* Packed key if the btree is an index */
652   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
653   int bias,           /* Bias search to the high end */
654   int *pRes           /* Write search results here */
655 ){
656   int rc;                    /* Status code */
657   UnpackedRecord *pIdxKey;   /* Unpacked index key */
658   char aSpace[150];          /* Temp space for pIdxKey - to avoid a malloc */
659 
660   if( pKey ){
661     assert( nKey==(i64)(int)nKey );
662     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey,
663                                       aSpace, sizeof(aSpace));
664     if( pIdxKey==0 ) return SQLITE_NOMEM;
665   }else{
666     pIdxKey = 0;
667   }
668   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
669   if( pKey ){
670     sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
671   }
672   return rc;
673 }
674 
675 /*
676 ** Restore the cursor to the position it was in (or as close to as possible)
677 ** when saveCursorPosition() was called. Note that this call deletes the
678 ** saved position info stored by saveCursorPosition(), so there can be
679 ** at most one effective restoreCursorPosition() call after each
680 ** saveCursorPosition().
681 */
btreeRestoreCursorPosition(BtCursor * pCur)682 static int btreeRestoreCursorPosition(BtCursor *pCur){
683   int rc;
684   assert( cursorHoldsMutex(pCur) );
685   assert( pCur->eState>=CURSOR_REQUIRESEEK );
686   if( pCur->eState==CURSOR_FAULT ){
687     return pCur->skipNext;
688   }
689   pCur->eState = CURSOR_INVALID;
690   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
691   if( rc==SQLITE_OK ){
692     sqlite3_free(pCur->pKey);
693     pCur->pKey = 0;
694     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
695   }
696   return rc;
697 }
698 
/*
** Restore cursor p to its saved position if it needs it.  Evaluates to
** the result of btreeRestoreCursorPosition(p) when the cursor state is
** CURSOR_REQUIRESEEK or higher, and to SQLITE_OK otherwise.
**
** The argument is parenthesized so that any pointer expression can be
** passed safely.  Note that p may be evaluated twice.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
703 
704 /*
705 ** Determine whether or not a cursor has moved from the position it
706 ** was last placed at.  Cursors can move when the row they are pointing
707 ** at is deleted out from under them.
708 **
709 ** This routine returns an error code if something goes wrong.  The
710 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
711 */
sqlite3BtreeCursorHasMoved(BtCursor * pCur,int * pHasMoved)712 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
713   int rc;
714 
715   rc = restoreCursorPosition(pCur);
716   if( rc ){
717     *pHasMoved = 1;
718     return rc;
719   }
720   if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){
721     *pHasMoved = 1;
722   }else{
723     *pHasMoved = 0;
724   }
725   return SQLITE_OK;
726 }
727 
728 #ifndef SQLITE_OMIT_AUTOVACUUM
729 /*
730 ** Given a page number of a regular database page, return the page
731 ** number for the pointer-map page that contains the entry for the
732 ** input page number.
733 **
734 ** Return 0 (not a valid page) for pgno==1 since there is
735 ** no pointer map associated with page 1.  The integrity_check logic
736 ** requires that ptrmapPageno(*,1)!=1.
737 */
ptrmapPageno(BtShared * pBt,Pgno pgno)738 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
739   int nPagesPerMapPage;
740   Pgno iPtrMap, ret;
741   assert( sqlite3_mutex_held(pBt->mutex) );
742   if( pgno<2 ) return 0;
743   nPagesPerMapPage = (pBt->usableSize/5)+1;
744   iPtrMap = (pgno-2)/nPagesPerMapPage;
745   ret = (iPtrMap*nPagesPerMapPage) + 2;
746   if( ret==PENDING_BYTE_PAGE(pBt) ){
747     ret++;
748   }
749   return ret;
750 }
751 
752 /*
753 ** Write an entry into the pointer map.
754 **
755 ** This routine updates the pointer map entry for page number 'key'
756 ** so that it maps to type 'eType' and parent page number 'pgno'.
757 **
758 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
759 ** a no-op.  If an error occurs, the appropriate error code is written
760 ** into *pRC.
761 */
ptrmapPut(BtShared * pBt,Pgno key,u8 eType,Pgno parent,int * pRC)762 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
763   DbPage *pDbPage;  /* The pointer map page */
764   u8 *pPtrmap;      /* The pointer map data */
765   Pgno iPtrmap;     /* The pointer map page number */
766   int offset;       /* Offset in pointer map page */
767   int rc;           /* Return code from subfunctions */
768 
769   if( *pRC ) return;
770 
771   assert( sqlite3_mutex_held(pBt->mutex) );
772   /* The master-journal page number must never be used as a pointer map page */
773   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
774 
775   assert( pBt->autoVacuum );
776   if( key==0 ){
777     *pRC = SQLITE_CORRUPT_BKPT;
778     return;
779   }
780   iPtrmap = PTRMAP_PAGENO(pBt, key);
781   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
782   if( rc!=SQLITE_OK ){
783     *pRC = rc;
784     return;
785   }
786   offset = PTRMAP_PTROFFSET(iPtrmap, key);
787   if( offset<0 ){
788     *pRC = SQLITE_CORRUPT_BKPT;
789     goto ptrmap_exit;
790   }
791   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
792 
793   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
794     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
795     *pRC= rc = sqlite3PagerWrite(pDbPage);
796     if( rc==SQLITE_OK ){
797       pPtrmap[offset] = eType;
798       put4byte(&pPtrmap[offset+1], parent);
799     }
800   }
801 
802 ptrmap_exit:
803   sqlite3PagerUnref(pDbPage);
804 }
805 
806 /*
807 ** Read an entry from the pointer map.
808 **
809 ** This routine retrieves the pointer map entry for page 'key', writing
810 ** the type and parent page number to *pEType and *pPgno respectively.
811 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
812 */
ptrmapGet(BtShared * pBt,Pgno key,u8 * pEType,Pgno * pPgno)813 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
814   DbPage *pDbPage;   /* The pointer map page */
815   int iPtrmap;       /* Pointer map page index */
816   u8 *pPtrmap;       /* Pointer map page data */
817   int offset;        /* Offset of entry in pointer map */
818   int rc;
819 
820   assert( sqlite3_mutex_held(pBt->mutex) );
821 
822   iPtrmap = PTRMAP_PAGENO(pBt, key);
823   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
824   if( rc!=0 ){
825     return rc;
826   }
827   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
828 
829   offset = PTRMAP_PTROFFSET(iPtrmap, key);
830   assert( pEType!=0 );
831   *pEType = pPtrmap[offset];
832   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
833 
834   sqlite3PagerUnref(pDbPage);
835   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
836   return SQLITE_OK;
837 }
838 
839 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
/* Without autovacuum there is no pointer map: these become no-ops */
#define ptrmapPut(w,x,y,z,rc)
#define ptrmapGet(w,x,y,z) SQLITE_OK
#define ptrmapPutOvflPtr(x, y, rc)
843 #endif
844 
/*
** Return a pointer to the content of the I-th cell of btree page P,
** where I==0 names the first cell, I==1 the second, and so forth.
** The 2-byte entry at P->aData[P->cellOffset+2*I] is read with
** get2byte(), masked with P->maskPage, and used as an offset into
** P->aData.
**
** This macro works only for pages that do not contain overflow cells.
** Note that the P argument is evaluated more than once.
*/
#define findCell(P,I) \
  (&(P)->aData[(P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])])
854 
855 /*
856 ** This a more complex version of findCell() that works for
857 ** pages that do contain overflow cells.
858 */
findOverflowCell(MemPage * pPage,int iCell)859 static u8 *findOverflowCell(MemPage *pPage, int iCell){
860   int i;
861   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
862   for(i=pPage->nOverflow-1; i>=0; i--){
863     int k;
864     struct _OvflCell *pOvfl;
865     pOvfl = &pPage->aOvfl[i];
866     k = pOvfl->idx;
867     if( k<=iCell ){
868       if( k==iCell ){
869         return pOvfl->pCell;
870       }
871       iCell--;
872     }
873   }
874   return findCell(pPage, iCell);
875 }
876 
877 /*
878 ** Parse a cell content block and fill in the CellInfo structure.  There
879 ** are two versions of this function.  btreeParseCell() takes a
880 ** cell index as the second argument and btreeParseCellPtr()
881 ** takes a pointer to the body of the cell as its second argument.
882 **
883 ** Within this file, the parseCell() macro can be called instead of
884 ** btreeParseCellPtr(). Using some compilers, this will be faster.
885 */
btreeParseCellPtr(MemPage * pPage,u8 * pCell,CellInfo * pInfo)886 static void btreeParseCellPtr(
887   MemPage *pPage,         /* Page containing the cell */
888   u8 *pCell,              /* Pointer to the cell text. */
889   CellInfo *pInfo         /* Fill in this structure */
890 ){
891   u16 n;                  /* Number bytes in cell content header */
892   u32 nPayload;           /* Number of bytes of cell payload */
893 
894   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
895 
896   pInfo->pCell = pCell;
897   assert( pPage->leaf==0 || pPage->leaf==1 );
898   n = pPage->childPtrSize;
899   assert( n==4-4*pPage->leaf );
900   if( pPage->intKey ){
901     if( pPage->hasData ){
902       n += getVarint32(&pCell[n], nPayload);
903     }else{
904       nPayload = 0;
905     }
906     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
907     pInfo->nData = nPayload;
908   }else{
909     pInfo->nData = 0;
910     n += getVarint32(&pCell[n], nPayload);
911     pInfo->nKey = nPayload;
912   }
913   pInfo->nPayload = nPayload;
914   pInfo->nHeader = n;
915   testcase( nPayload==pPage->maxLocal );
916   testcase( nPayload==pPage->maxLocal+1 );
917   if( likely(nPayload<=pPage->maxLocal) ){
918     /* This is the (easy) common case where the entire payload fits
919     ** on the local page.  No overflow is required.
920     */
921     if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4;
922     pInfo->nLocal = (u16)nPayload;
923     pInfo->iOverflow = 0;
924   }else{
925     /* If the payload will not fit completely on the local page, we have
926     ** to decide how much to store locally and how much to spill onto
927     ** overflow pages.  The strategy is to minimize the amount of unused
928     ** space on overflow pages while keeping the amount of local storage
929     ** in between minLocal and maxLocal.
930     **
931     ** Warning:  changing the way overflow payload is distributed in any
932     ** way will result in an incompatible file format.
933     */
934     int minLocal;  /* Minimum amount of payload held locally */
935     int maxLocal;  /* Maximum amount of payload held locally */
936     int surplus;   /* Overflow payload available for local storage */
937 
938     minLocal = pPage->minLocal;
939     maxLocal = pPage->maxLocal;
940     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
941     testcase( surplus==maxLocal );
942     testcase( surplus==maxLocal+1 );
943     if( surplus <= maxLocal ){
944       pInfo->nLocal = (u16)surplus;
945     }else{
946       pInfo->nLocal = (u16)minLocal;
947     }
948     pInfo->iOverflow = (u16)(pInfo->nLocal + n);
949     pInfo->nSize = pInfo->iOverflow + 4;
950   }
951 }
952 #define parseCell(pPage, iCell, pInfo) \
953   btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
btreeParseCell(MemPage * pPage,int iCell,CellInfo * pInfo)954 static void btreeParseCell(
955   MemPage *pPage,         /* Page containing the cell */
956   int iCell,              /* The cell index.  First cell is 0 */
957   CellInfo *pInfo         /* Fill in this structure */
958 ){
959   parseCell(pPage, iCell, pInfo);
960 }
961 
962 /*
963 ** Compute the total number of bytes that a Cell needs in the cell
964 ** data area of the btree-page.  The return number includes the cell
965 ** data header and the local payload, but not any overflow page or
966 ** the space used by the cell pointer.
967 */
cellSizePtr(MemPage * pPage,u8 * pCell)968 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
969   u8 *pIter = &pCell[pPage->childPtrSize];
970   u32 nSize;
971 
972 #ifdef SQLITE_DEBUG
973   /* The value returned by this function should always be the same as
974   ** the (CellInfo.nSize) value found by doing a full parse of the
975   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
976   ** this function verifies that this invariant is not violated. */
977   CellInfo debuginfo;
978   btreeParseCellPtr(pPage, pCell, &debuginfo);
979 #endif
980 
981   if( pPage->intKey ){
982     u8 *pEnd;
983     if( pPage->hasData ){
984       pIter += getVarint32(pIter, nSize);
985     }else{
986       nSize = 0;
987     }
988 
989     /* pIter now points at the 64-bit integer key value, a variable length
990     ** integer. The following block moves pIter to point at the first byte
991     ** past the end of the key value. */
992     pEnd = &pIter[9];
993     while( (*pIter++)&0x80 && pIter<pEnd );
994   }else{
995     pIter += getVarint32(pIter, nSize);
996   }
997 
998   testcase( nSize==pPage->maxLocal );
999   testcase( nSize==pPage->maxLocal+1 );
1000   if( nSize>pPage->maxLocal ){
1001     int minLocal = pPage->minLocal;
1002     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1003     testcase( nSize==pPage->maxLocal );
1004     testcase( nSize==pPage->maxLocal+1 );
1005     if( nSize>pPage->maxLocal ){
1006       nSize = minLocal;
1007     }
1008     nSize += 4;
1009   }
1010   nSize += (u32)(pIter - pCell);
1011 
1012   /* The minimum size of any cell is 4 bytes. */
1013   if( nSize<4 ){
1014     nSize = 4;
1015   }
1016 
1017   assert( nSize==debuginfo.nSize );
1018   return (u16)nSize;
1019 }
1020 
#ifdef SQLITE_DEBUG
/* A variation on cellSizePtr() that takes a cell index rather than a
** cell pointer.  It is compiled only when SQLITE_DEBUG is defined and
** is intended for use inside of assert() statements. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return cellSizePtr(pPage, pCell);
}
#endif
1028 
1029 #ifndef SQLITE_OMIT_AUTOVACUUM
1030 /*
1031 ** If the cell pCell, part of page pPage contains a pointer
1032 ** to an overflow page, insert an entry into the pointer-map
1033 ** for the overflow page.
1034 */
ptrmapPutOvflPtr(MemPage * pPage,u8 * pCell,int * pRC)1035 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1036   CellInfo info;
1037   if( *pRC ) return;
1038   assert( pCell!=0 );
1039   btreeParseCellPtr(pPage, pCell, &info);
1040   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1041   if( info.iOverflow ){
1042     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1043     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1044   }
1045 }
1046 #endif
1047 
1048 
1049 /*
1050 ** Defragment the page given.  All Cells are moved to the
1051 ** end of the page and all free space is collected into one
1052 ** big FreeBlk that occurs in between the header and cell
1053 ** pointer array and the cell content area.
1054 */
defragmentPage(MemPage * pPage)1055 static int defragmentPage(MemPage *pPage){
1056   int i;                     /* Loop counter */
1057   int pc;                    /* Address of a i-th cell */
1058   int hdr;                   /* Offset to the page header */
1059   int size;                  /* Size of a cell */
1060   int usableSize;            /* Number of usable bytes on a page */
1061   int cellOffset;            /* Offset to the cell pointer array */
1062   int cbrk;                  /* Offset to the cell content area */
1063   int nCell;                 /* Number of cells on the page */
1064   unsigned char *data;       /* The page data */
1065   unsigned char *temp;       /* Temp area for cell content */
1066   int iCellFirst;            /* First allowable cell index */
1067   int iCellLast;             /* Last possible cell index */
1068 
1069 
1070   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1071   assert( pPage->pBt!=0 );
1072   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1073   assert( pPage->nOverflow==0 );
1074   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1075   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1076   data = pPage->aData;
1077   hdr = pPage->hdrOffset;
1078   cellOffset = pPage->cellOffset;
1079   nCell = pPage->nCell;
1080   assert( nCell==get2byte(&data[hdr+3]) );
1081   usableSize = pPage->pBt->usableSize;
1082   cbrk = get2byte(&data[hdr+5]);
1083   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
1084   cbrk = usableSize;
1085   iCellFirst = cellOffset + 2*nCell;
1086   iCellLast = usableSize - 4;
1087   for(i=0; i<nCell; i++){
1088     u8 *pAddr;     /* The i-th cell pointer */
1089     pAddr = &data[cellOffset + i*2];
1090     pc = get2byte(pAddr);
1091     testcase( pc==iCellFirst );
1092     testcase( pc==iCellLast );
1093 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1094     /* These conditions have already been verified in btreeInitPage()
1095     ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
1096     */
1097     if( pc<iCellFirst || pc>iCellLast ){
1098       return SQLITE_CORRUPT_BKPT;
1099     }
1100 #endif
1101     assert( pc>=iCellFirst && pc<=iCellLast );
1102     size = cellSizePtr(pPage, &temp[pc]);
1103     cbrk -= size;
1104 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1105     if( cbrk<iCellFirst ){
1106       return SQLITE_CORRUPT_BKPT;
1107     }
1108 #else
1109     if( cbrk<iCellFirst || pc+size>usableSize ){
1110       return SQLITE_CORRUPT_BKPT;
1111     }
1112 #endif
1113     assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1114     testcase( cbrk+size==usableSize );
1115     testcase( pc+size==usableSize );
1116     memcpy(&data[cbrk], &temp[pc], size);
1117     put2byte(pAddr, cbrk);
1118   }
1119   assert( cbrk>=iCellFirst );
1120   put2byte(&data[hdr+5], cbrk);
1121   data[hdr+1] = 0;
1122   data[hdr+2] = 0;
1123   data[hdr+7] = 0;
1124   memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1125   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1126   if( cbrk-iCellFirst!=pPage->nFree ){
1127     return SQLITE_CORRUPT_BKPT;
1128   }
1129   return SQLITE_OK;
1130 }
1131 
1132 /*
1133 ** Allocate nByte bytes of space from within the B-Tree page passed
1134 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1135 ** of the first byte of allocated space. Return either SQLITE_OK or
1136 ** an error code (usually SQLITE_CORRUPT).
1137 **
1138 ** The caller guarantees that there is sufficient space to make the
1139 ** allocation.  This routine might need to defragment in order to bring
1140 ** all the space together, however.  This routine will avoid using
1141 ** the first two bytes past the cell pointer area since presumably this
1142 ** allocation is being made in order to insert a new cell, so we will
1143 ** also end up needing a new cell pointer.
1144 */
allocateSpace(MemPage * pPage,int nByte,int * pIdx)1145 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1146   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1147   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1148   int nFrag;                           /* Number of fragmented bytes on pPage */
1149   int top;                             /* First byte of cell content area */
1150   int gap;        /* First byte of gap between cell pointers and cell content */
1151   int rc;         /* Integer return code */
1152   int usableSize; /* Usable size of the page */
1153 
1154   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1155   assert( pPage->pBt );
1156   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1157   assert( nByte>=0 );  /* Minimum cell size is 4 */
1158   assert( pPage->nFree>=nByte );
1159   assert( pPage->nOverflow==0 );
1160   usableSize = pPage->pBt->usableSize;
1161   assert( nByte < usableSize-8 );
1162 
1163   nFrag = data[hdr+7];
1164   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1165   gap = pPage->cellOffset + 2*pPage->nCell;
1166   top = get2byteNotZero(&data[hdr+5]);
1167   if( gap>top ) return SQLITE_CORRUPT_BKPT;
1168   testcase( gap+2==top );
1169   testcase( gap+1==top );
1170   testcase( gap==top );
1171 
1172   if( nFrag>=60 ){
1173     /* Always defragment highly fragmented pages */
1174     rc = defragmentPage(pPage);
1175     if( rc ) return rc;
1176     top = get2byteNotZero(&data[hdr+5]);
1177   }else if( gap+2<=top ){
1178     /* Search the freelist looking for a free slot big enough to satisfy
1179     ** the request. The allocation is made from the first free slot in
1180     ** the list that is large enough to accomadate it.
1181     */
1182     int pc, addr;
1183     for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
1184       int size;            /* Size of the free slot */
1185       if( pc>usableSize-4 || pc<addr+4 ){
1186         return SQLITE_CORRUPT_BKPT;
1187       }
1188       size = get2byte(&data[pc+2]);
1189       if( size>=nByte ){
1190         int x = size - nByte;
1191         testcase( x==4 );
1192         testcase( x==3 );
1193         if( x<4 ){
1194           /* Remove the slot from the free-list. Update the number of
1195           ** fragmented bytes within the page. */
1196           memcpy(&data[addr], &data[pc], 2);
1197           data[hdr+7] = (u8)(nFrag + x);
1198         }else if( size+pc > usableSize ){
1199           return SQLITE_CORRUPT_BKPT;
1200         }else{
1201           /* The slot remains on the free-list. Reduce its size to account
1202           ** for the portion used by the new allocation. */
1203           put2byte(&data[pc+2], x);
1204         }
1205         *pIdx = pc + x;
1206         return SQLITE_OK;
1207       }
1208     }
1209   }
1210 
1211   /* Check to make sure there is enough space in the gap to satisfy
1212   ** the allocation.  If not, defragment.
1213   */
1214   testcase( gap+2+nByte==top );
1215   if( gap+2+nByte>top ){
1216     rc = defragmentPage(pPage);
1217     if( rc ) return rc;
1218     top = get2byteNotZero(&data[hdr+5]);
1219     assert( gap+nByte<=top );
1220   }
1221 
1222 
1223   /* Allocate memory from the gap in between the cell pointer array
1224   ** and the cell content area.  The btreeInitPage() call has already
1225   ** validated the freelist.  Given that the freelist is valid, there
1226   ** is no way that the allocation can extend off the end of the page.
1227   ** The assert() below verifies the previous sentence.
1228   */
1229   top -= nByte;
1230   put2byte(&data[hdr+5], top);
1231   assert( top+nByte <= (int)pPage->pBt->usableSize );
1232   *pIdx = top;
1233   return SQLITE_OK;
1234 }
1235 
1236 /*
1237 ** Return a section of the pPage->aData to the freelist.
1238 ** The first byte of the new free block is pPage->aDisk[start]
1239 ** and the size of the block is "size" bytes.
1240 **
1241 ** Most of the effort here is involved in coalesing adjacent
1242 ** free blocks into a single big free block.
1243 */
freeSpace(MemPage * pPage,int start,int size)1244 static int freeSpace(MemPage *pPage, int start, int size){
1245   int addr, pbegin, hdr;
1246   int iLast;                        /* Largest possible freeblock offset */
1247   unsigned char *data = pPage->aData;
1248 
1249   assert( pPage->pBt!=0 );
1250   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1251   assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
1252   assert( (start + size) <= (int)pPage->pBt->usableSize );
1253   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1254   assert( size>=0 );   /* Minimum cell size is 4 */
1255 
1256   if( pPage->pBt->secureDelete ){
1257     /* Overwrite deleted information with zeros when the secure_delete
1258     ** option is enabled */
1259     memset(&data[start], 0, size);
1260   }
1261 
1262   /* Add the space back into the linked list of freeblocks.  Note that
1263   ** even though the freeblock list was checked by btreeInitPage(),
1264   ** btreeInitPage() did not detect overlapping cells or
1265   ** freeblocks that overlapped cells.   Nor does it detect when the
1266   ** cell content area exceeds the value in the page header.  If these
1267   ** situations arise, then subsequent insert operations might corrupt
1268   ** the freelist.  So we do need to check for corruption while scanning
1269   ** the freelist.
1270   */
1271   hdr = pPage->hdrOffset;
1272   addr = hdr + 1;
1273   iLast = pPage->pBt->usableSize - 4;
1274   assert( start<=iLast );
1275   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1276     if( pbegin<addr+4 ){
1277       return SQLITE_CORRUPT_BKPT;
1278     }
1279     addr = pbegin;
1280   }
1281   if( pbegin>iLast ){
1282     return SQLITE_CORRUPT_BKPT;
1283   }
1284   assert( pbegin>addr || pbegin==0 );
1285   put2byte(&data[addr], start);
1286   put2byte(&data[start], pbegin);
1287   put2byte(&data[start+2], size);
1288   pPage->nFree = pPage->nFree + (u16)size;
1289 
1290   /* Coalesce adjacent free blocks */
1291   addr = hdr + 1;
1292   while( (pbegin = get2byte(&data[addr]))>0 ){
1293     int pnext, psize, x;
1294     assert( pbegin>addr );
1295     assert( pbegin <= (int)pPage->pBt->usableSize-4 );
1296     pnext = get2byte(&data[pbegin]);
1297     psize = get2byte(&data[pbegin+2]);
1298     if( pbegin + psize + 3 >= pnext && pnext>0 ){
1299       int frag = pnext - (pbegin+psize);
1300       if( (frag<0) || (frag>(int)data[hdr+7]) ){
1301         return SQLITE_CORRUPT_BKPT;
1302       }
1303       data[hdr+7] -= (u8)frag;
1304       x = get2byte(&data[pnext]);
1305       put2byte(&data[pbegin], x);
1306       x = pnext + get2byte(&data[pnext+2]) - pbegin;
1307       put2byte(&data[pbegin+2], x);
1308     }else{
1309       addr = pbegin;
1310     }
1311   }
1312 
1313   /* If the cell content area begins with a freeblock, remove it. */
1314   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1315     int top;
1316     pbegin = get2byte(&data[hdr+1]);
1317     memcpy(&data[hdr+1], &data[pbegin], 2);
1318     top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
1319     put2byte(&data[hdr+5], top);
1320   }
1321   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1322   return SQLITE_OK;
1323 }
1324 
1325 /*
1326 ** Decode the flags byte (the first byte of the header) for a page
1327 ** and initialize fields of the MemPage structure accordingly.
1328 **
1329 ** Only the following combinations are supported.  Anything different
1330 ** indicates a corrupt database files:
1331 **
1332 **         PTF_ZERODATA
1333 **         PTF_ZERODATA | PTF_LEAF
1334 **         PTF_LEAFDATA | PTF_INTKEY
1335 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1336 */
decodeFlags(MemPage * pPage,int flagByte)1337 static int decodeFlags(MemPage *pPage, int flagByte){
1338   BtShared *pBt;     /* A copy of pPage->pBt */
1339 
1340   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1341   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1342   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1343   flagByte &= ~PTF_LEAF;
1344   pPage->childPtrSize = 4-4*pPage->leaf;
1345   pBt = pPage->pBt;
1346   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1347     pPage->intKey = 1;
1348     pPage->hasData = pPage->leaf;
1349     pPage->maxLocal = pBt->maxLeaf;
1350     pPage->minLocal = pBt->minLeaf;
1351   }else if( flagByte==PTF_ZERODATA ){
1352     pPage->intKey = 0;
1353     pPage->hasData = 0;
1354     pPage->maxLocal = pBt->maxLocal;
1355     pPage->minLocal = pBt->minLocal;
1356   }else{
1357     return SQLITE_CORRUPT_BKPT;
1358   }
1359   return SQLITE_OK;
1360 }
1361 
1362 /*
1363 ** Initialize the auxiliary information for a disk block.
1364 **
1365 ** Return SQLITE_OK on success.  If we see that the page does
1366 ** not contain a well-formed database page, then return
1367 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1368 ** guarantee that the page is well-formed.  It only shows that
1369 ** we failed to detect any corruption.
1370 */
btreeInitPage(MemPage * pPage)1371 static int btreeInitPage(MemPage *pPage){
1372 
1373   assert( pPage->pBt!=0 );
1374   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1375   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1376   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1377   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1378 
1379   if( !pPage->isInit ){
1380     u16 pc;            /* Address of a freeblock within pPage->aData[] */
1381     u8 hdr;            /* Offset to beginning of page header */
1382     u8 *data;          /* Equal to pPage->aData */
1383     BtShared *pBt;        /* The main btree structure */
1384     int usableSize;    /* Amount of usable space on each page */
1385     u16 cellOffset;    /* Offset from start of page to first cell pointer */
1386     int nFree;         /* Number of unused bytes on the page */
1387     int top;           /* First byte of the cell content area */
1388     int iCellFirst;    /* First allowable cell or freeblock offset */
1389     int iCellLast;     /* Last possible cell or freeblock offset */
1390 
1391     pBt = pPage->pBt;
1392 
1393     hdr = pPage->hdrOffset;
1394     data = pPage->aData;
1395     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1396     assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1397     pPage->maskPage = (u16)(pBt->pageSize - 1);
1398     pPage->nOverflow = 0;
1399     usableSize = pBt->usableSize;
1400     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1401     top = get2byteNotZero(&data[hdr+5]);
1402     pPage->nCell = get2byte(&data[hdr+3]);
1403     if( pPage->nCell>MX_CELL(pBt) ){
1404       /* To many cells for a single page.  The page must be corrupt */
1405       return SQLITE_CORRUPT_BKPT;
1406     }
1407     testcase( pPage->nCell==MX_CELL(pBt) );
1408 
1409     /* A malformed database page might cause us to read past the end
1410     ** of page when parsing a cell.
1411     **
1412     ** The following block of code checks early to see if a cell extends
1413     ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1414     ** returned if it does.
1415     */
1416     iCellFirst = cellOffset + 2*pPage->nCell;
1417     iCellLast = usableSize - 4;
1418 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1419     {
1420       int i;            /* Index into the cell pointer array */
1421       int sz;           /* Size of a cell */
1422 
1423       if( !pPage->leaf ) iCellLast--;
1424       for(i=0; i<pPage->nCell; i++){
1425         pc = get2byte(&data[cellOffset+i*2]);
1426         testcase( pc==iCellFirst );
1427         testcase( pc==iCellLast );
1428         if( pc<iCellFirst || pc>iCellLast ){
1429           return SQLITE_CORRUPT_BKPT;
1430         }
1431         sz = cellSizePtr(pPage, &data[pc]);
1432         testcase( pc+sz==usableSize );
1433         if( pc+sz>usableSize ){
1434           return SQLITE_CORRUPT_BKPT;
1435         }
1436       }
1437       if( !pPage->leaf ) iCellLast++;
1438     }
1439 #endif
1440 
1441     /* Compute the total free space on the page */
1442     pc = get2byte(&data[hdr+1]);
1443     nFree = data[hdr+7] + top;
1444     while( pc>0 ){
1445       u16 next, size;
1446       if( pc<iCellFirst || pc>iCellLast ){
1447         /* Start of free block is off the page */
1448         return SQLITE_CORRUPT_BKPT;
1449       }
1450       next = get2byte(&data[pc]);
1451       size = get2byte(&data[pc+2]);
1452       if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1453         /* Free blocks must be in ascending order. And the last byte of
1454 	** the free-block must lie on the database page.  */
1455         return SQLITE_CORRUPT_BKPT;
1456       }
1457       nFree = nFree + size;
1458       pc = next;
1459     }
1460 
1461     /* At this point, nFree contains the sum of the offset to the start
1462     ** of the cell-content area plus the number of free bytes within
1463     ** the cell-content area. If this is greater than the usable-size
1464     ** of the page, then the page must be corrupted. This check also
1465     ** serves to verify that the offset to the start of the cell-content
1466     ** area, according to the page header, lies within the page.
1467     */
1468     if( nFree>usableSize ){
1469       return SQLITE_CORRUPT_BKPT;
1470     }
1471     pPage->nFree = (u16)(nFree - iCellFirst);
1472     pPage->isInit = 1;
1473   }
1474   return SQLITE_OK;
1475 }
1476 
1477 /*
1478 ** Set up a raw page so that it looks like a database page holding
1479 ** no entries.
1480 */
zeroPage(MemPage * pPage,int flags)1481 static void zeroPage(MemPage *pPage, int flags){
1482   unsigned char *data = pPage->aData;
1483   BtShared *pBt = pPage->pBt;
1484   u8 hdr = pPage->hdrOffset;
1485   u16 first;
1486 
1487   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1488   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1489   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1490   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1491   assert( sqlite3_mutex_held(pBt->mutex) );
1492   if( pBt->secureDelete ){
1493     memset(&data[hdr], 0, pBt->usableSize - hdr);
1494   }
1495   data[hdr] = (char)flags;
1496   first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
1497   memset(&data[hdr+1], 0, 4);
1498   data[hdr+7] = 0;
1499   put2byte(&data[hdr+5], pBt->usableSize);
1500   pPage->nFree = (u16)(pBt->usableSize - first);
1501   decodeFlags(pPage, flags);
1502   pPage->hdrOffset = hdr;
1503   pPage->cellOffset = first;
1504   pPage->nOverflow = 0;
1505   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1506   pPage->maskPage = (u16)(pBt->pageSize - 1);
1507   pPage->nCell = 0;
1508   pPage->isInit = 1;
1509 }
1510 
1511 
1512 /*
1513 ** Convert a DbPage obtained from the pager into a MemPage used by
1514 ** the btree layer.
1515 */
btreePageFromDbPage(DbPage * pDbPage,Pgno pgno,BtShared * pBt)1516 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1517   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1518   pPage->aData = sqlite3PagerGetData(pDbPage);
1519   pPage->pDbPage = pDbPage;
1520   pPage->pBt = pBt;
1521   pPage->pgno = pgno;
1522   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1523   return pPage;
1524 }
1525 
1526 /*
1527 ** Get a page from the pager.  Initialize the MemPage.pBt and
1528 ** MemPage.aData elements if needed.
1529 **
1530 ** If the noContent flag is set, it means that we do not care about
1531 ** the content of the page at this time.  So do not go to the disk
1532 ** to fetch the content.  Just fill in the content with zeros for now.
1533 ** If in the future we call sqlite3PagerWrite() on this page, that
1534 ** means we have started to be concerned about content and the disk
1535 ** read should occur at that point.
1536 */
btreeGetPage(BtShared * pBt,Pgno pgno,MemPage ** ppPage,int noContent)1537 static int btreeGetPage(
1538   BtShared *pBt,       /* The btree */
1539   Pgno pgno,           /* Number of the page to fetch */
1540   MemPage **ppPage,    /* Return the page in this parameter */
1541   int noContent        /* Do not load page content if true */
1542 ){
1543   int rc;
1544   DbPage *pDbPage;
1545 
1546   assert( sqlite3_mutex_held(pBt->mutex) );
1547   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1548   if( rc ) return rc;
1549   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1550   return SQLITE_OK;
1551 }
1552 
1553 /*
1554 ** Retrieve a page from the pager cache. If the requested page is not
1555 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1556 ** MemPage.aData elements if needed.
1557 */
btreePageLookup(BtShared * pBt,Pgno pgno)1558 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1559   DbPage *pDbPage;
1560   assert( sqlite3_mutex_held(pBt->mutex) );
1561   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1562   if( pDbPage ){
1563     return btreePageFromDbPage(pDbPage, pgno, pBt);
1564   }
1565   return 0;
1566 }
1567 
1568 /*
1569 ** Return the size of the database file in pages. If there is any kind of
1570 ** error, return ((unsigned int)-1).
1571 */
btreePagecount(BtShared * pBt)1572 static Pgno btreePagecount(BtShared *pBt){
1573   return pBt->nPage;
1574 }
sqlite3BtreeLastPage(Btree * p)1575 u32 sqlite3BtreeLastPage(Btree *p){
1576   assert( sqlite3BtreeHoldsMutex(p) );
1577   assert( ((p->pBt->nPage)&0x8000000)==0 );
1578   return (int)btreePagecount(p->pBt);
1579 }
1580 
1581 /*
1582 ** Get a page from the pager and initialize it.  This routine is just a
1583 ** convenience wrapper around separate calls to btreeGetPage() and
1584 ** btreeInitPage().
1585 **
1586 ** If an error occurs, then the value *ppPage is set to is undefined. It
1587 ** may remain unchanged, or it may be set to an invalid value.
1588 */
getAndInitPage(BtShared * pBt,Pgno pgno,MemPage ** ppPage)1589 static int getAndInitPage(
1590   BtShared *pBt,          /* The database file */
1591   Pgno pgno,           /* Number of the page to get */
1592   MemPage **ppPage     /* Write the page pointer here */
1593 ){
1594   int rc;
1595   assert( sqlite3_mutex_held(pBt->mutex) );
1596 
1597   if( pgno>btreePagecount(pBt) ){
1598     rc = SQLITE_CORRUPT_BKPT;
1599   }else{
1600     rc = btreeGetPage(pBt, pgno, ppPage, 0);
1601     if( rc==SQLITE_OK ){
1602       rc = btreeInitPage(*ppPage);
1603       if( rc!=SQLITE_OK ){
1604         releasePage(*ppPage);
1605       }
1606     }
1607   }
1608 
1609   testcase( pgno==0 );
1610   assert( pgno!=0 || rc==SQLITE_CORRUPT );
1611   return rc;
1612 }
1613 
1614 /*
1615 ** Release a MemPage.  This should be called once for each prior
1616 ** call to btreeGetPage.
1617 */
releasePage(MemPage * pPage)1618 static void releasePage(MemPage *pPage){
1619   if( pPage ){
1620     assert( pPage->aData );
1621     assert( pPage->pBt );
1622     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1623     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1624     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1625     sqlite3PagerUnref(pPage->pDbPage);
1626   }
1627 }
1628 
1629 /*
1630 ** During a rollback, when the pager reloads information into the cache
1631 ** so that the cache is restored to its original state at the start of
1632 ** the transaction, for each page restored this routine is called.
1633 **
1634 ** This routine needs to reset the extra data section at the end of the
1635 ** page to agree with the restored data.
1636 */
pageReinit(DbPage * pData)1637 static void pageReinit(DbPage *pData){
1638   MemPage *pPage;
1639   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1640   assert( sqlite3PagerPageRefcount(pData)>0 );
1641   if( pPage->isInit ){
1642     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1643     pPage->isInit = 0;
1644     if( sqlite3PagerPageRefcount(pData)>1 ){
1645       /* pPage might not be a btree page;  it might be an overflow page
1646       ** or ptrmap page or a free page.  In those cases, the following
1647       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
1648       ** But no harm is done by this.  And it is very important that
1649       ** btreeInitPage() be called on every btree page so we make
1650       ** the call for every page that comes in for re-initing. */
1651       btreeInitPage(pPage);
1652     }
1653   }
1654 }
1655 
1656 /*
1657 ** Invoke the busy handler for a btree.
1658 */
btreeInvokeBusyHandler(void * pArg)1659 static int btreeInvokeBusyHandler(void *pArg){
1660   BtShared *pBt = (BtShared*)pArg;
1661   assert( pBt->db );
1662   assert( sqlite3_mutex_held(pBt->db->mutex) );
1663   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1664 }
1665 
1666 /*
1667 ** Open a database file.
1668 **
1669 ** zFilename is the name of the database file.  If zFilename is NULL
1670 ** then an ephemeral database is created.  The ephemeral database might
1671 ** be exclusively in memory, or it might use a disk-based memory cache.
1672 ** Either way, the ephemeral database will be automatically deleted
1673 ** when sqlite3BtreeClose() is called.
1674 **
1675 ** If zFilename is ":memory:" then an in-memory database is created
1676 ** that is automatically destroyed when it is closed.
1677 **
1678 ** The "flags" parameter is a bitmask that might contain bits
1679 ** BTREE_OMIT_JOURNAL and/or BTREE_NO_READLOCK.  The BTREE_NO_READLOCK
1680 ** bit is also set if the SQLITE_NoReadlock flags is set in db->flags.
1681 ** These flags are passed through into sqlite3PagerOpen() and must
1682 ** be the same values as PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK.
1683 **
1684 ** If the database is already opened in the same database connection
1685 ** and we are in shared cache mode, then the open will fail with an
1686 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
1687 ** objects in the same database connection since doing so will lead
1688 ** to problems with locking.
1689 */
sqlite3BtreeOpen(const char * zFilename,sqlite3 * db,Btree ** ppBtree,int flags,int vfsFlags)1690 int sqlite3BtreeOpen(
1691   const char *zFilename,  /* Name of the file containing the BTree database */
1692   sqlite3 *db,            /* Associated database handle */
1693   Btree **ppBtree,        /* Pointer to new Btree object written here */
1694   int flags,              /* Options */
1695   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
1696 ){
1697   sqlite3_vfs *pVfs;             /* The VFS to use for this btree */
1698   BtShared *pBt = 0;             /* Shared part of btree structure */
1699   Btree *p;                      /* Handle to return */
1700   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
1701   int rc = SQLITE_OK;            /* Result code from this function */
1702   u8 nReserve;                   /* Byte of unused space on each page */
1703   unsigned char zDbHeader[100];  /* Database header content */
1704 
1705   /* True if opening an ephemeral, temporary database */
1706   const int isTempDb = zFilename==0 || zFilename[0]==0;
1707 
1708   /* Set the variable isMemdb to true for an in-memory database, or
1709   ** false for a file-based database.
1710   */
1711 #ifdef SQLITE_OMIT_MEMORYDB
1712   const int isMemdb = 0;
1713 #else
1714   const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
1715                        || (isTempDb && sqlite3TempInMemory(db));
1716 #endif
1717 
1718   assert( db!=0 );
1719   assert( sqlite3_mutex_held(db->mutex) );
1720   assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
1721 
1722   /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
1723   assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
1724 
1725   /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
1726   assert( (flags & BTREE_SINGLE)==0 || isTempDb );
1727 
1728   if( db->flags & SQLITE_NoReadlock ){
1729     flags |= BTREE_NO_READLOCK;
1730   }
1731   if( isMemdb ){
1732     flags |= BTREE_MEMORY;
1733   }
1734   if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
1735     vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
1736   }
1737   pVfs = db->pVfs;
1738   p = sqlite3MallocZero(sizeof(Btree));
1739   if( !p ){
1740     return SQLITE_NOMEM;
1741   }
1742   p->inTrans = TRANS_NONE;
1743   p->db = db;
1744 #ifndef SQLITE_OMIT_SHARED_CACHE
1745   p->lock.pBtree = p;
1746   p->lock.iTable = 1;
1747 #endif
1748 
1749 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1750   /*
1751   ** If this Btree is a candidate for shared cache, try to find an
1752   ** existing BtShared object that we can share with
1753   */
1754   if( isMemdb==0 && isTempDb==0 ){
1755     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
1756       int nFullPathname = pVfs->mxPathname+1;
1757       char *zFullPathname = sqlite3Malloc(nFullPathname);
1758       sqlite3_mutex *mutexShared;
1759       p->sharable = 1;
1760       if( !zFullPathname ){
1761         sqlite3_free(p);
1762         return SQLITE_NOMEM;
1763       }
1764       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1765       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1766       sqlite3_mutex_enter(mutexOpen);
1767       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1768       sqlite3_mutex_enter(mutexShared);
1769       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1770         assert( pBt->nRef>0 );
1771         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1772                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1773           int iDb;
1774           for(iDb=db->nDb-1; iDb>=0; iDb--){
1775             Btree *pExisting = db->aDb[iDb].pBt;
1776             if( pExisting && pExisting->pBt==pBt ){
1777               sqlite3_mutex_leave(mutexShared);
1778               sqlite3_mutex_leave(mutexOpen);
1779               sqlite3_free(zFullPathname);
1780               sqlite3_free(p);
1781               return SQLITE_CONSTRAINT;
1782             }
1783           }
1784           p->pBt = pBt;
1785           pBt->nRef++;
1786           break;
1787         }
1788       }
1789       sqlite3_mutex_leave(mutexShared);
1790       sqlite3_free(zFullPathname);
1791     }
1792 #ifdef SQLITE_DEBUG
1793     else{
1794       /* In debug mode, we mark all persistent databases as sharable
1795       ** even when they are not.  This exercises the locking code and
1796       ** gives more opportunity for asserts(sqlite3_mutex_held())
1797       ** statements to find locking problems.
1798       */
1799       p->sharable = 1;
1800     }
1801 #endif
1802   }
1803 #endif
1804   if( pBt==0 ){
1805     /*
1806     ** The following asserts make sure that structures used by the btree are
1807     ** the right size.  This is to guard against size changes that result
1808     ** when compiling on a different architecture.
1809     */
1810     assert( sizeof(i64)==8 || sizeof(i64)==4 );
1811     assert( sizeof(u64)==8 || sizeof(u64)==4 );
1812     assert( sizeof(u32)==4 );
1813     assert( sizeof(u16)==2 );
1814     assert( sizeof(Pgno)==4 );
1815 
1816     pBt = sqlite3MallocZero( sizeof(*pBt) );
1817     if( pBt==0 ){
1818       rc = SQLITE_NOMEM;
1819       goto btree_open_out;
1820     }
1821     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1822                           EXTRA_SIZE, flags, vfsFlags, pageReinit);
1823     if( rc==SQLITE_OK ){
1824       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1825     }
1826     if( rc!=SQLITE_OK ){
1827       goto btree_open_out;
1828     }
1829     pBt->openFlags = (u8)flags;
1830     pBt->db = db;
1831     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1832     p->pBt = pBt;
1833 
1834     pBt->pCursor = 0;
1835     pBt->pPage1 = 0;
1836     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1837 #ifdef SQLITE_SECURE_DELETE
1838     pBt->secureDelete = 1;
1839 #endif
1840     pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
1841     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1842          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1843       pBt->pageSize = 0;
1844 #ifndef SQLITE_OMIT_AUTOVACUUM
1845       /* If the magic name ":memory:" will create an in-memory database, then
1846       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1847       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1848       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1849       ** regular file-name. In this case the auto-vacuum applies as per normal.
1850       */
1851       if( zFilename && !isMemdb ){
1852         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1853         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1854       }
1855 #endif
1856       nReserve = 0;
1857     }else{
1858       nReserve = zDbHeader[20];
1859       pBt->pageSizeFixed = 1;
1860 #ifndef SQLITE_OMIT_AUTOVACUUM
1861       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1862       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1863 #endif
1864     }
1865     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
1866     if( rc ) goto btree_open_out;
1867     pBt->usableSize = pBt->pageSize - nReserve;
1868     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
1869 
1870 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1871     /* Add the new BtShared object to the linked list sharable BtShareds.
1872     */
1873     if( p->sharable ){
1874       sqlite3_mutex *mutexShared;
1875       pBt->nRef = 1;
1876       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1877       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1878         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1879         if( pBt->mutex==0 ){
1880           rc = SQLITE_NOMEM;
1881           db->mallocFailed = 0;
1882           goto btree_open_out;
1883         }
1884       }
1885       sqlite3_mutex_enter(mutexShared);
1886       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1887       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1888       sqlite3_mutex_leave(mutexShared);
1889     }
1890 #endif
1891   }
1892 
1893 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1894   /* If the new Btree uses a sharable pBtShared, then link the new
1895   ** Btree into the list of all sharable Btrees for the same connection.
1896   ** The list is kept in ascending order by pBt address.
1897   */
1898   if( p->sharable ){
1899     int i;
1900     Btree *pSib;
1901     for(i=0; i<db->nDb; i++){
1902       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1903         while( pSib->pPrev ){ pSib = pSib->pPrev; }
1904         if( p->pBt<pSib->pBt ){
1905           p->pNext = pSib;
1906           p->pPrev = 0;
1907           pSib->pPrev = p;
1908         }else{
1909           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1910             pSib = pSib->pNext;
1911           }
1912           p->pNext = pSib->pNext;
1913           p->pPrev = pSib;
1914           if( p->pNext ){
1915             p->pNext->pPrev = p;
1916           }
1917           pSib->pNext = p;
1918         }
1919         break;
1920       }
1921     }
1922   }
1923 #endif
1924   *ppBtree = p;
1925 
1926 btree_open_out:
1927   if( rc!=SQLITE_OK ){
1928     if( pBt && pBt->pPager ){
1929       sqlite3PagerClose(pBt->pPager);
1930     }
1931     sqlite3_free(pBt);
1932     sqlite3_free(p);
1933     *ppBtree = 0;
1934   }else{
1935     /* If the B-Tree was successfully opened, set the pager-cache size to the
1936     ** default value. Except, when opening on an existing shared pager-cache,
1937     ** do not change the pager-cache size.
1938     */
1939     if( sqlite3BtreeSchema(p, 0, 0)==0 ){
1940       sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
1941     }
1942   }
1943   if( mutexOpen ){
1944     assert( sqlite3_mutex_held(mutexOpen) );
1945     sqlite3_mutex_leave(mutexOpen);
1946   }
1947   return rc;
1948 }
1949 
1950 /*
1951 ** Decrement the BtShared.nRef counter.  When it reaches zero,
1952 ** remove the BtShared structure from the sharing list.  Return
1953 ** true if the BtShared.nRef counter reaches zero and return
1954 ** false if it is still positive.
1955 */
removeFromSharingList(BtShared * pBt)1956 static int removeFromSharingList(BtShared *pBt){
1957 #ifndef SQLITE_OMIT_SHARED_CACHE
1958   sqlite3_mutex *pMaster;
1959   BtShared *pList;
1960   int removed = 0;
1961 
1962   assert( sqlite3_mutex_notheld(pBt->mutex) );
1963   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1964   sqlite3_mutex_enter(pMaster);
1965   pBt->nRef--;
1966   if( pBt->nRef<=0 ){
1967     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1968       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1969     }else{
1970       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1971       while( ALWAYS(pList) && pList->pNext!=pBt ){
1972         pList=pList->pNext;
1973       }
1974       if( ALWAYS(pList) ){
1975         pList->pNext = pBt->pNext;
1976       }
1977     }
1978     if( SQLITE_THREADSAFE ){
1979       sqlite3_mutex_free(pBt->mutex);
1980     }
1981     removed = 1;
1982   }
1983   sqlite3_mutex_leave(pMaster);
1984   return removed;
1985 #else
1986   return 1;
1987 #endif
1988 }
1989 
1990 /*
1991 ** Make sure pBt->pTmpSpace points to an allocation of
1992 ** MX_CELL_SIZE(pBt) bytes.
1993 */
allocateTempSpace(BtShared * pBt)1994 static void allocateTempSpace(BtShared *pBt){
1995   if( !pBt->pTmpSpace ){
1996     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1997   }
1998 }
1999 
2000 /*
2001 ** Free the pBt->pTmpSpace allocation
2002 */
freeTempSpace(BtShared * pBt)2003 static void freeTempSpace(BtShared *pBt){
2004   sqlite3PageFree( pBt->pTmpSpace);
2005   pBt->pTmpSpace = 0;
2006 }
2007 
2008 /*
2009 ** Close an open database and invalidate all cursors.
2010 */
sqlite3BtreeClose(Btree * p)2011 int sqlite3BtreeClose(Btree *p){
2012   BtShared *pBt = p->pBt;
2013   BtCursor *pCur;
2014 
2015   /* Close all cursors opened via this handle.  */
2016   assert( sqlite3_mutex_held(p->db->mutex) );
2017   sqlite3BtreeEnter(p);
2018   pCur = pBt->pCursor;
2019   while( pCur ){
2020     BtCursor *pTmp = pCur;
2021     pCur = pCur->pNext;
2022     if( pTmp->pBtree==p ){
2023       sqlite3BtreeCloseCursor(pTmp);
2024     }
2025   }
2026 
2027   /* Rollback any active transaction and free the handle structure.
2028   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2029   ** this handle.
2030   */
2031   sqlite3BtreeRollback(p);
2032   sqlite3BtreeLeave(p);
2033 
2034   /* If there are still other outstanding references to the shared-btree
2035   ** structure, return now. The remainder of this procedure cleans
2036   ** up the shared-btree.
2037   */
2038   assert( p->wantToLock==0 && p->locked==0 );
2039   if( !p->sharable || removeFromSharingList(pBt) ){
2040     /* The pBt is no longer on the sharing list, so we can access
2041     ** it without having to hold the mutex.
2042     **
2043     ** Clean out and delete the BtShared object.
2044     */
2045     assert( !pBt->pCursor );
2046     sqlite3PagerClose(pBt->pPager);
2047     if( pBt->xFreeSchema && pBt->pSchema ){
2048       pBt->xFreeSchema(pBt->pSchema);
2049     }
2050     sqlite3DbFree(0, pBt->pSchema);
2051     freeTempSpace(pBt);
2052     sqlite3_free(pBt);
2053   }
2054 
2055 #ifndef SQLITE_OMIT_SHARED_CACHE
2056   assert( p->wantToLock==0 );
2057   assert( p->locked==0 );
2058   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2059   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2060 #endif
2061 
2062   sqlite3_free(p);
2063   return SQLITE_OK;
2064 }
2065 
2066 /*
2067 ** Change the limit on the number of pages allowed in the cache.
2068 **
2069 ** The maximum number of cache pages is set to the absolute
2070 ** value of mxPage.  If mxPage is negative, the pager will
2071 ** operate asynchronously - it will not stop to do fsync()s
2072 ** to insure data is written to the disk surface before
2073 ** continuing.  Transactions still work if synchronous is off,
2074 ** and the database cannot be corrupted if this program
2075 ** crashes.  But if the operating system crashes or there is
2076 ** an abrupt power failure when synchronous is off, the database
2077 ** could be left in an inconsistent and unrecoverable state.
2078 ** Synchronous is on by default so database corruption is not
2079 ** normally a worry.
2080 */
sqlite3BtreeSetCacheSize(Btree * p,int mxPage)2081 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2082   BtShared *pBt = p->pBt;
2083   assert( sqlite3_mutex_held(p->db->mutex) );
2084   sqlite3BtreeEnter(p);
2085   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2086   sqlite3BtreeLeave(p);
2087   return SQLITE_OK;
2088 }
2089 
2090 /*
2091 ** Change the way data is synced to disk in order to increase or decrease
2092 ** how well the database resists damage due to OS crashes and power
2093 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2094 ** there is a high probability of damage)  Level 2 is the default.  There
2095 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2096 ** probability of damage to near zero but with a write performance reduction.
2097 */
2098 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
sqlite3BtreeSetSafetyLevel(Btree * p,int level,int fullSync,int ckptFullSync)2099 int sqlite3BtreeSetSafetyLevel(
2100   Btree *p,              /* The btree to set the safety level on */
2101   int level,             /* PRAGMA synchronous.  1=OFF, 2=NORMAL, 3=FULL */
2102   int fullSync,          /* PRAGMA fullfsync. */
2103   int ckptFullSync       /* PRAGMA checkpoint_fullfync */
2104 ){
2105   BtShared *pBt = p->pBt;
2106   assert( sqlite3_mutex_held(p->db->mutex) );
2107   assert( level>=1 && level<=3 );
2108   sqlite3BtreeEnter(p);
2109   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync, ckptFullSync);
2110   sqlite3BtreeLeave(p);
2111   return SQLITE_OK;
2112 }
2113 #endif
2114 
2115 /*
2116 ** Return TRUE if the given btree is set to safety level 1.  In other
2117 ** words, return TRUE if no sync() occurs on the disk files.
2118 */
sqlite3BtreeSyncDisabled(Btree * p)2119 int sqlite3BtreeSyncDisabled(Btree *p){
2120   BtShared *pBt = p->pBt;
2121   int rc;
2122   assert( sqlite3_mutex_held(p->db->mutex) );
2123   sqlite3BtreeEnter(p);
2124   assert( pBt && pBt->pPager );
2125   rc = sqlite3PagerNosync(pBt->pPager);
2126   sqlite3BtreeLeave(p);
2127   return rc;
2128 }
2129 
2130 /*
2131 ** Change the default pages size and the number of reserved bytes per page.
2132 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2133 ** without changing anything.
2134 **
2135 ** The page size must be a power of 2 between 512 and 65536.  If the page
2136 ** size supplied does not meet this constraint then the page size is not
2137 ** changed.
2138 **
2139 ** Page sizes are constrained to be a power of two so that the region
2140 ** of the database file used for locking (beginning at PENDING_BYTE,
2141 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2142 ** at the beginning of a page.
2143 **
2144 ** If parameter nReserve is less than zero, then the number of reserved
2145 ** bytes per page is left unchanged.
2146 **
2147 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
2148 ** and autovacuum mode can no longer be changed.
2149 */
sqlite3BtreeSetPageSize(Btree * p,int pageSize,int nReserve,int iFix)2150 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2151   int rc = SQLITE_OK;
2152   BtShared *pBt = p->pBt;
2153   assert( nReserve>=-1 && nReserve<=255 );
2154   sqlite3BtreeEnter(p);
2155   if( pBt->pageSizeFixed ){
2156     sqlite3BtreeLeave(p);
2157     return SQLITE_READONLY;
2158   }
2159   if( nReserve<0 ){
2160     nReserve = pBt->pageSize - pBt->usableSize;
2161   }
2162   assert( nReserve>=0 && nReserve<=255 );
2163   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2164         ((pageSize-1)&pageSize)==0 ){
2165     assert( (pageSize & 7)==0 );
2166     assert( !pBt->pPage1 && !pBt->pCursor );
2167     pBt->pageSize = (u32)pageSize;
2168     freeTempSpace(pBt);
2169   }
2170   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2171   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2172   if( iFix ) pBt->pageSizeFixed = 1;
2173   sqlite3BtreeLeave(p);
2174   return rc;
2175 }
2176 
2177 /*
2178 ** Return the currently defined page size
2179 */
sqlite3BtreeGetPageSize(Btree * p)2180 int sqlite3BtreeGetPageSize(Btree *p){
2181   return p->pBt->pageSize;
2182 }
2183 
2184 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
2185 /*
2186 ** Return the number of bytes of space at the end of every page that
2187 ** are intentually left unused.  This is the "reserved" space that is
2188 ** sometimes used by extensions.
2189 */
sqlite3BtreeGetReserve(Btree * p)2190 int sqlite3BtreeGetReserve(Btree *p){
2191   int n;
2192   sqlite3BtreeEnter(p);
2193   n = p->pBt->pageSize - p->pBt->usableSize;
2194   sqlite3BtreeLeave(p);
2195   return n;
2196 }
2197 
2198 /*
2199 ** Set the maximum page count for a database if mxPage is positive.
2200 ** No changes are made if mxPage is 0 or negative.
2201 ** Regardless of the value of mxPage, return the maximum page count.
2202 */
sqlite3BtreeMaxPageCount(Btree * p,int mxPage)2203 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2204   int n;
2205   sqlite3BtreeEnter(p);
2206   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2207   sqlite3BtreeLeave(p);
2208   return n;
2209 }
2210 
2211 /*
2212 ** Set the secureDelete flag if newFlag is 0 or 1.  If newFlag is -1,
2213 ** then make no changes.  Always return the value of the secureDelete
2214 ** setting after the change.
2215 */
sqlite3BtreeSecureDelete(Btree * p,int newFlag)2216 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2217   int b;
2218   if( p==0 ) return 0;
2219   sqlite3BtreeEnter(p);
2220   if( newFlag>=0 ){
2221     p->pBt->secureDelete = (newFlag!=0) ? 1 : 0;
2222   }
2223   b = p->pBt->secureDelete;
2224   sqlite3BtreeLeave(p);
2225   return b;
2226 }
2227 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
2228 
2229 /*
2230 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2231 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2232 ** is disabled. The default value for the auto-vacuum property is
2233 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2234 */
sqlite3BtreeSetAutoVacuum(Btree * p,int autoVacuum)2235 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2236 #ifdef SQLITE_OMIT_AUTOVACUUM
2237   return SQLITE_READONLY;
2238 #else
2239   BtShared *pBt = p->pBt;
2240   int rc = SQLITE_OK;
2241   u8 av = (u8)autoVacuum;
2242 
2243   sqlite3BtreeEnter(p);
2244   if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){
2245     rc = SQLITE_READONLY;
2246   }else{
2247     pBt->autoVacuum = av ?1:0;
2248     pBt->incrVacuum = av==2 ?1:0;
2249   }
2250   sqlite3BtreeLeave(p);
2251   return rc;
2252 #endif
2253 }
2254 
2255 /*
2256 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2257 ** enabled 1 is returned. Otherwise 0.
2258 */
sqlite3BtreeGetAutoVacuum(Btree * p)2259 int sqlite3BtreeGetAutoVacuum(Btree *p){
2260 #ifdef SQLITE_OMIT_AUTOVACUUM
2261   return BTREE_AUTOVACUUM_NONE;
2262 #else
2263   int rc;
2264   sqlite3BtreeEnter(p);
2265   rc = (
2266     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2267     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2268     BTREE_AUTOVACUUM_INCR
2269   );
2270   sqlite3BtreeLeave(p);
2271   return rc;
2272 #endif
2273 }
2274 
2275 
2276 /*
2277 ** Get a reference to pPage1 of the database file.  This will
2278 ** also acquire a readlock on that file.
2279 **
2280 ** SQLITE_OK is returned on success.  If the file is not a
2281 ** well-formed database file, then SQLITE_CORRUPT is returned.
2282 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
2283 ** is returned if we run out of memory.
2284 */
lockBtree(BtShared * pBt)2285 static int lockBtree(BtShared *pBt){
2286   int rc;              /* Result code from subfunctions */
2287   MemPage *pPage1;     /* Page 1 of the database file */
2288   int nPage;           /* Number of pages in the database */
2289   int nPageFile = 0;   /* Number of pages in the database file */
2290   int nPageHeader;     /* Number of pages in the database according to hdr */
2291 
2292   assert( sqlite3_mutex_held(pBt->mutex) );
2293   assert( pBt->pPage1==0 );
2294   rc = sqlite3PagerSharedLock(pBt->pPager);
2295   if( rc!=SQLITE_OK ) return rc;
2296   rc = btreeGetPage(pBt, 1, &pPage1, 0);
2297   if( rc!=SQLITE_OK ) return rc;
2298 
2299   /* Do some checking to help insure the file we opened really is
2300   ** a valid database file.
2301   */
2302   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2303   sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2304   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2305     nPage = nPageFile;
2306   }
2307   if( nPage>0 ){
2308     u32 pageSize;
2309     u32 usableSize;
2310     u8 *page1 = pPage1->aData;
2311     rc = SQLITE_NOTADB;
2312     if( memcmp(page1, zMagicHeader, 16)!=0 ){
2313       goto page1_init_failed;
2314     }
2315 
2316 #ifdef SQLITE_OMIT_WAL
2317     if( page1[18]>1 ){
2318       pBt->readOnly = 1;
2319     }
2320     if( page1[19]>1 ){
2321       goto page1_init_failed;
2322     }
2323 #else
2324     if( page1[18]>2 ){
2325       pBt->readOnly = 1;
2326     }
2327     if( page1[19]>2 ){
2328       goto page1_init_failed;
2329     }
2330 
2331     /* If the write version is set to 2, this database should be accessed
2332     ** in WAL mode. If the log is not already open, open it now. Then
2333     ** return SQLITE_OK and return without populating BtShared.pPage1.
2334     ** The caller detects this and calls this function again. This is
2335     ** required as the version of page 1 currently in the page1 buffer
2336     ** may not be the latest version - there may be a newer one in the log
2337     ** file.
2338     */
2339     if( page1[19]==2 && pBt->doNotUseWAL==0 ){
2340       int isOpen = 0;
2341       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2342       if( rc!=SQLITE_OK ){
2343         goto page1_init_failed;
2344       }else if( isOpen==0 ){
2345         releasePage(pPage1);
2346         return SQLITE_OK;
2347       }
2348       rc = SQLITE_NOTADB;
2349     }
2350 #endif
2351 
2352     /* The maximum embedded fraction must be exactly 25%.  And the minimum
2353     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2354     ** The original design allowed these amounts to vary, but as of
2355     ** version 3.6.0, we require them to be fixed.
2356     */
2357     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2358       goto page1_init_failed;
2359     }
2360     pageSize = (page1[16]<<8) | (page1[17]<<16);
2361     if( ((pageSize-1)&pageSize)!=0
2362      || pageSize>SQLITE_MAX_PAGE_SIZE
2363      || pageSize<=256
2364     ){
2365       goto page1_init_failed;
2366     }
2367     assert( (pageSize & 7)==0 );
2368     usableSize = pageSize - page1[20];
2369     if( (u32)pageSize!=pBt->pageSize ){
2370       /* After reading the first page of the database assuming a page size
2371       ** of BtShared.pageSize, we have discovered that the page-size is
2372       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2373       ** zero and return SQLITE_OK. The caller will call this function
2374       ** again with the correct page-size.
2375       */
2376       releasePage(pPage1);
2377       pBt->usableSize = usableSize;
2378       pBt->pageSize = pageSize;
2379       freeTempSpace(pBt);
2380       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2381                                    pageSize-usableSize);
2382       return rc;
2383     }
2384     if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2385       rc = SQLITE_CORRUPT_BKPT;
2386       goto page1_init_failed;
2387     }
2388     if( usableSize<480 ){
2389       goto page1_init_failed;
2390     }
2391     pBt->pageSize = pageSize;
2392     pBt->usableSize = usableSize;
2393 #ifndef SQLITE_OMIT_AUTOVACUUM
2394     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2395     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2396 #endif
2397   }
2398 
2399   /* maxLocal is the maximum amount of payload to store locally for
2400   ** a cell.  Make sure it is small enough so that at least minFanout
2401   ** cells can will fit on one page.  We assume a 10-byte page header.
2402   ** Besides the payload, the cell must store:
2403   **     2-byte pointer to the cell
2404   **     4-byte child pointer
2405   **     9-byte nKey value
2406   **     4-byte nData value
2407   **     4-byte overflow page pointer
2408   ** So a cell consists of a 2-byte pointer, a header which is as much as
2409   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2410   ** page pointer.
2411   */
2412   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2413   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2414   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2415   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2416   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2417   pBt->pPage1 = pPage1;
2418   pBt->nPage = nPage;
2419   return SQLITE_OK;
2420 
2421 page1_init_failed:
2422   releasePage(pPage1);
2423   pBt->pPage1 = 0;
2424   return rc;
2425 }
2426 
2427 /*
2428 ** If there are no outstanding cursors and we are not in the middle
2429 ** of a transaction but there is a read lock on the database, then
2430 ** this routine unrefs the first page of the database file which
2431 ** has the effect of releasing the read lock.
2432 **
2433 ** If there is a transaction in progress, this routine is a no-op.
2434 */
unlockBtreeIfUnused(BtShared * pBt)2435 static void unlockBtreeIfUnused(BtShared *pBt){
2436   assert( sqlite3_mutex_held(pBt->mutex) );
2437   assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE );
2438   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2439     assert( pBt->pPage1->aData );
2440     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2441     assert( pBt->pPage1->aData );
2442     releasePage(pBt->pPage1);
2443     pBt->pPage1 = 0;
2444   }
2445 }
2446 
2447 /*
2448 ** If pBt points to an empty file then convert that empty file
2449 ** into a new empty database by initializing the first page of
2450 ** the database.
2451 */
newDatabase(BtShared * pBt)2452 static int newDatabase(BtShared *pBt){
2453   MemPage *pP1;
2454   unsigned char *data;
2455   int rc;
2456 
2457   assert( sqlite3_mutex_held(pBt->mutex) );
2458   if( pBt->nPage>0 ){
2459     return SQLITE_OK;
2460   }
2461   pP1 = pBt->pPage1;
2462   assert( pP1!=0 );
2463   data = pP1->aData;
2464   rc = sqlite3PagerWrite(pP1->pDbPage);
2465   if( rc ) return rc;
2466   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2467   assert( sizeof(zMagicHeader)==16 );
2468   data[16] = (u8)((pBt->pageSize>>8)&0xff);
2469   data[17] = (u8)((pBt->pageSize>>16)&0xff);
2470   data[18] = 1;
2471   data[19] = 1;
2472   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2473   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2474   data[21] = 64;
2475   data[22] = 32;
2476   data[23] = 32;
2477   memset(&data[24], 0, 100-24);
2478   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2479   pBt->pageSizeFixed = 1;
2480 #ifndef SQLITE_OMIT_AUTOVACUUM
2481   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2482   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2483   put4byte(&data[36 + 4*4], pBt->autoVacuum);
2484   put4byte(&data[36 + 7*4], pBt->incrVacuum);
2485 #endif
2486   pBt->nPage = 1;
2487   data[31] = 1;
2488   return SQLITE_OK;
2489 }
2490 
/*
** Attempt to start a new transaction. A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
**
** Returns SQLITE_OK on success.  SQLITE_READONLY is returned if a
** write-transaction is requested on a read-only database, and
** SQLITE_LOCKED_SHAREDCACHE if another connection sharing the cache
** blocks the request.  Any other error code comes from the lower-level
** routines called here.
*/
int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  sqlite3 *pBlock = 0;      /* Connection blocking this request, if any */
  BtShared *pBt = p->pBt;
  int rc = SQLITE_OK;

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }

  /* Write transactions are not possible on a read-only database */
  if( pBt->readOnly && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  /* If another database handle has already opened a write transaction
  ** on this shared-btree structure and a second write transaction is
  ** requested, return SQLITE_LOCKED.  An exclusive request (wrflag>1)
  ** is also blocked by any lock held by a different Btree handle.
  */
  if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){
    pBlock = pBt->pWriter->db;
  }else if( wrflag>1 ){
    BtLock *pIter;
    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
      if( pIter->pBtree!=p ){
        pBlock = pIter->pBtree->db;
        break;
      }
    }
  }
  if( pBlock ){
    sqlite3ConnectionBlocked(p->db, pBlock);
    rc = SQLITE_LOCKED_SHAREDCACHE;
    goto trans_begun;
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  pBt->initiallyEmpty = (u8)(pBt->nPage==0);
  do {
    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      if( pBt->readOnly ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          rc = newDatabase(pBt);
        }
      }
    }

    /* On failure, drop the reference to page 1 (if nothing else is using
    ** the btree) before possibly retrying via the busy handler. */
    if( rc!=SQLITE_OK ){
      unlockBtreeIfUnused(pBt);
    }
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        /* Link this handle's page 1 read-lock into the shared lock list. */
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->isExclusive = (u8)(wrflag>1);
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }


trans_begun:
  if( rc==SQLITE_OK && wrflag ){
    /* This call makes sure that the pager has the correct number of
    ** open savepoints. If the second parameter is greater than 0 and
    ** the sub-journal is not already open, then it will be opened here.
    */
    rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
2658 
2659 #ifndef SQLITE_OMIT_AUTOVACUUM
2660 
2661 /*
2662 ** Set the pointer-map entries for all children of page pPage. Also, if
2663 ** pPage contains cells that point to overflow pages, set the pointer
2664 ** map entries for the overflow pages as well.
2665 */
setChildPtrmaps(MemPage * pPage)2666 static int setChildPtrmaps(MemPage *pPage){
2667   int i;                             /* Counter variable */
2668   int nCell;                         /* Number of cells in page pPage */
2669   int rc;                            /* Return code */
2670   BtShared *pBt = pPage->pBt;
2671   u8 isInitOrig = pPage->isInit;
2672   Pgno pgno = pPage->pgno;
2673 
2674   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2675   rc = btreeInitPage(pPage);
2676   if( rc!=SQLITE_OK ){
2677     goto set_child_ptrmaps_out;
2678   }
2679   nCell = pPage->nCell;
2680 
2681   for(i=0; i<nCell; i++){
2682     u8 *pCell = findCell(pPage, i);
2683 
2684     ptrmapPutOvflPtr(pPage, pCell, &rc);
2685 
2686     if( !pPage->leaf ){
2687       Pgno childPgno = get4byte(pCell);
2688       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2689     }
2690   }
2691 
2692   if( !pPage->leaf ){
2693     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2694     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2695   }
2696 
2697 set_child_ptrmaps_out:
2698   pPage->isInit = isInitOrig;
2699   return rc;
2700 }
2701 
2702 /*
2703 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
2704 ** that it points to iTo. Parameter eType describes the type of pointer to
2705 ** be modified, as  follows:
2706 **
2707 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
2708 **                   page of pPage.
2709 **
2710 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2711 **                   page pointed to by one of the cells on pPage.
2712 **
2713 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2714 **                   overflow page in the list.
2715 */
modifyPagePointer(MemPage * pPage,Pgno iFrom,Pgno iTo,u8 eType)2716 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2717   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2718   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2719   if( eType==PTRMAP_OVERFLOW2 ){
2720     /* The pointer is always the first 4 bytes of the page in this case.  */
2721     if( get4byte(pPage->aData)!=iFrom ){
2722       return SQLITE_CORRUPT_BKPT;
2723     }
2724     put4byte(pPage->aData, iTo);
2725   }else{
2726     u8 isInitOrig = pPage->isInit;
2727     int i;
2728     int nCell;
2729 
2730     btreeInitPage(pPage);
2731     nCell = pPage->nCell;
2732 
2733     for(i=0; i<nCell; i++){
2734       u8 *pCell = findCell(pPage, i);
2735       if( eType==PTRMAP_OVERFLOW1 ){
2736         CellInfo info;
2737         btreeParseCellPtr(pPage, pCell, &info);
2738         if( info.iOverflow ){
2739           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2740             put4byte(&pCell[info.iOverflow], iTo);
2741             break;
2742           }
2743         }
2744       }else{
2745         if( get4byte(pCell)==iFrom ){
2746           put4byte(pCell, iTo);
2747           break;
2748         }
2749       }
2750     }
2751 
2752     if( i==nCell ){
2753       if( eType!=PTRMAP_BTREE ||
2754           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2755         return SQLITE_CORRUPT_BKPT;
2756       }
2757       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2758     }
2759 
2760     pPage->isInit = isInitOrig;
2761   }
2762   return SQLITE_OK;
2763 }
2764 
2765 
2766 /*
2767 ** Move the open database page pDbPage to location iFreePage in the
2768 ** database. The pDbPage reference remains valid.
2769 **
2770 ** The isCommit flag indicates that there is no need to remember that
2771 ** the journal needs to be sync()ed before database page pDbPage->pgno
2772 ** can be written to. The caller has already promised not to write to that
2773 ** page.
2774 */
relocatePage(BtShared * pBt,MemPage * pDbPage,u8 eType,Pgno iPtrPage,Pgno iFreePage,int isCommit)2775 static int relocatePage(
2776   BtShared *pBt,           /* Btree */
2777   MemPage *pDbPage,        /* Open page to move */
2778   u8 eType,                /* Pointer map 'type' entry for pDbPage */
2779   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
2780   Pgno iFreePage,          /* The location to move pDbPage to */
2781   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
2782 ){
2783   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
2784   Pgno iDbPage = pDbPage->pgno;
2785   Pager *pPager = pBt->pPager;
2786   int rc;
2787 
2788   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2789       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2790   assert( sqlite3_mutex_held(pBt->mutex) );
2791   assert( pDbPage->pBt==pBt );
2792 
2793   /* Move page iDbPage from its current location to page number iFreePage */
2794   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2795       iDbPage, iFreePage, iPtrPage, eType));
2796   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2797   if( rc!=SQLITE_OK ){
2798     return rc;
2799   }
2800   pDbPage->pgno = iFreePage;
2801 
2802   /* If pDbPage was a btree-page, then it may have child pages and/or cells
2803   ** that point to overflow pages. The pointer map entries for all these
2804   ** pages need to be changed.
2805   **
2806   ** If pDbPage is an overflow page, then the first 4 bytes may store a
2807   ** pointer to a subsequent overflow page. If this is the case, then
2808   ** the pointer map needs to be updated for the subsequent overflow page.
2809   */
2810   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2811     rc = setChildPtrmaps(pDbPage);
2812     if( rc!=SQLITE_OK ){
2813       return rc;
2814     }
2815   }else{
2816     Pgno nextOvfl = get4byte(pDbPage->aData);
2817     if( nextOvfl!=0 ){
2818       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
2819       if( rc!=SQLITE_OK ){
2820         return rc;
2821       }
2822     }
2823   }
2824 
2825   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2826   ** that it points at iFreePage. Also fix the pointer map entry for
2827   ** iPtrPage.
2828   */
2829   if( eType!=PTRMAP_ROOTPAGE ){
2830     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2831     if( rc!=SQLITE_OK ){
2832       return rc;
2833     }
2834     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2835     if( rc!=SQLITE_OK ){
2836       releasePage(pPtrPage);
2837       return rc;
2838     }
2839     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2840     releasePage(pPtrPage);
2841     if( rc==SQLITE_OK ){
2842       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
2843     }
2844   }
2845   return rc;
2846 }
2847 
2848 /* Forward declaration required by incrVacuumStep(). */
2849 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2850 
/*
** Perform a single step of an incremental-vacuum. If successful,
** return SQLITE_OK. If there is no work to do (and therefore no
** point in calling this function again), return SQLITE_DONE.
**
** More specifically, this function attempts to re-organize the
** database so that the last page of the file currently in use
** is no longer in use.
**
** If the nFin parameter is non-zero, this function assumes
** that the caller will keep calling incrVacuumStep() until
** it returns SQLITE_DONE or an error, and that nFin is the
** number of pages the database file will contain after this
** process is complete.  If nFin is zero, it is assumed that
** incrVacuumStep() will be called a finite number of times
** which may or may not empty the freelist.  A full autovacuum
** has nFin>0.  A "PRAGMA incremental_vacuum" has nFin==0.
**
** iLastPg is the page to be vacated by this step; it must be greater
** than nFin.  When nFin is zero this routine also truncates the pager
** image and updates pBt->nPage before returning SQLITE_OK.
*/
static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
  Pgno nFreeList;           /* Number of pages still on the free-list */
  int rc;

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( iLastPg>nFin );

  /* Pointer-map pages and the pending-byte page are never moved. */
  if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
    u8 eType;
    Pgno iPtrPage;

    nFreeList = get4byte(&pBt->pPage1->aData[36]);
    if( nFreeList==0 ){
      return SQLITE_DONE;
    }

    rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    if( eType==PTRMAP_ROOTPAGE ){
      return SQLITE_CORRUPT_BKPT;
    }

    if( eType==PTRMAP_FREEPAGE ){
      if( nFin==0 ){
        /* Remove the page from the file's free-list. This is not required
        ** if nFin is non-zero. In that case, the free-list will be
        ** truncated to zero after this function returns, so it doesn't
        ** matter if it still contains some garbage entries.
        */
        Pgno iFreePg;
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        assert( iFreePg==iLastPg );
        releasePage(pFreePg);
      }
    } else {
      Pgno iFreePg;             /* Index of free page to move pLastPg to */
      MemPage *pLastPg;

      rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* If nFin is zero, this loop runs exactly once and page pLastPg
      ** is swapped with the first free page pulled off the free list.
      **
      ** On the other hand, if nFin is greater than zero, then keep
      ** looping until a free-page located within the first nFin pages
      ** of the file is found.
      */
      do {
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
        if( rc!=SQLITE_OK ){
          releasePage(pLastPg);
          return rc;
        }
        releasePage(pFreePg);
      }while( nFin!=0 && iFreePg>nFin );
      assert( iFreePg<iLastPg );

      /* Make the page writable, then move it into the free slot. */
      rc = sqlite3PagerWrite(pLastPg->pDbPage);
      if( rc==SQLITE_OK ){
        rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
      }
      releasePage(pLastPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }

  if( nFin==0 ){
    /* Step back over any trailing pointer-map pages and the pending-byte
    ** page.  Each pointer-map page skipped is passed to sqlite3PagerWrite()
    ** before the image is truncated. */
    iLastPg--;
    while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
      if( PTRMAP_ISPAGE(pBt, iLastPg) ){
        MemPage *pPg;
        rc = btreeGetPage(pBt, iLastPg, &pPg, 0);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        rc = sqlite3PagerWrite(pPg->pDbPage);
        releasePage(pPg);
        if( rc!=SQLITE_OK ){
          return rc;
        }
      }
      iLastPg--;
    }
    sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
    pBt->nPage = iLastPg;
  }
  return SQLITE_OK;
}
2969 
2970 /*
2971 ** A write-transaction must be opened before calling this function.
2972 ** It performs a single unit of work towards an incremental vacuum.
2973 **
2974 ** If the incremental vacuum is finished after this function has run,
2975 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
2976 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2977 */
sqlite3BtreeIncrVacuum(Btree * p)2978 int sqlite3BtreeIncrVacuum(Btree *p){
2979   int rc;
2980   BtShared *pBt = p->pBt;
2981 
2982   sqlite3BtreeEnter(p);
2983   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2984   if( !pBt->autoVacuum ){
2985     rc = SQLITE_DONE;
2986   }else{
2987     invalidateAllOverflowCache(pBt);
2988     rc = incrVacuumStep(pBt, 0, btreePagecount(pBt));
2989     if( rc==SQLITE_OK ){
2990       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2991       put4byte(&pBt->pPage1->aData[28], pBt->nPage);
2992     }
2993   }
2994   sqlite3BtreeLeave(p);
2995   return rc;
2996 }
2997 
2998 /*
2999 ** This routine is called prior to sqlite3PagerCommit when a transaction
3000 ** is commited for an auto-vacuum database.
3001 **
3002 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3003 ** the database file should be truncated to during the commit process.
3004 ** i.e. the database has been reorganized so that only the first *pnTrunc
3005 ** pages are in use.
3006 */
autoVacuumCommit(BtShared * pBt)3007 static int autoVacuumCommit(BtShared *pBt){
3008   int rc = SQLITE_OK;
3009   Pager *pPager = pBt->pPager;
3010   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
3011 
3012   assert( sqlite3_mutex_held(pBt->mutex) );
3013   invalidateAllOverflowCache(pBt);
3014   assert(pBt->autoVacuum);
3015   if( !pBt->incrVacuum ){
3016     Pgno nFin;         /* Number of pages in database after autovacuuming */
3017     Pgno nFree;        /* Number of pages on the freelist initially */
3018     Pgno nPtrmap;      /* Number of PtrMap pages to be freed */
3019     Pgno iFree;        /* The next page to be freed */
3020     int nEntry;        /* Number of entries on one ptrmap page */
3021     Pgno nOrig;        /* Database size before freeing */
3022 
3023     nOrig = btreePagecount(pBt);
3024     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3025       /* It is not possible to create a database for which the final page
3026       ** is either a pointer-map page or the pending-byte page. If one
3027       ** is encountered, this indicates corruption.
3028       */
3029       return SQLITE_CORRUPT_BKPT;
3030     }
3031 
3032     nFree = get4byte(&pBt->pPage1->aData[36]);
3033     nEntry = pBt->usableSize/5;
3034     nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3035     nFin = nOrig - nFree - nPtrmap;
3036     if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3037       nFin--;
3038     }
3039     while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3040       nFin--;
3041     }
3042     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3043 
3044     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3045       rc = incrVacuumStep(pBt, nFin, iFree);
3046     }
3047     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3048       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3049       put4byte(&pBt->pPage1->aData[32], 0);
3050       put4byte(&pBt->pPage1->aData[36], 0);
3051       put4byte(&pBt->pPage1->aData[28], nFin);
3052       sqlite3PagerTruncateImage(pBt->pPager, nFin);
3053       pBt->nPage = nFin;
3054     }
3055     if( rc!=SQLITE_OK ){
3056       sqlite3PagerRollback(pPager);
3057     }
3058   }
3059 
3060   assert( nRef==sqlite3PagerRefcount(pPager) );
3061   return rc;
3062 }
3063 
3064 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3065 # define setChildPtrmaps(x) SQLITE_OK
3066 #endif
3067 
3068 /*
3069 ** This routine does the first phase of a two-phase commit.  This routine
3070 ** causes a rollback journal to be created (if it does not already exist)
3071 ** and populated with enough information so that if a power loss occurs
3072 ** the database can be restored to its original state by playing back
3073 ** the journal.  Then the contents of the journal are flushed out to
3074 ** the disk.  After the journal is safely on oxide, the changes to the
3075 ** database are written into the database file and flushed to oxide.
3076 ** At the end of this call, the rollback journal still exists on the
3077 ** disk and we are still holding all locks, so the transaction has not
3078 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3079 ** commit process.
3080 **
3081 ** This call is a no-op if no write-transaction is currently active on pBt.
3082 **
3083 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3084 ** the name of a master journal file that should be written into the
3085 ** individual journal file, or is NULL, indicating no master journal file
3086 ** (single database transaction).
3087 **
3088 ** When this is called, the master journal should already have been
3089 ** created, populated with this journal pointer and synced to disk.
3090 **
3091 ** Once this is routine has returned, the only thing required to commit
3092 ** the write-transaction for this database file is to delete the journal.
3093 */
sqlite3BtreeCommitPhaseOne(Btree * p,const char * zMaster)3094 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3095   int rc = SQLITE_OK;
3096   if( p->inTrans==TRANS_WRITE ){
3097     BtShared *pBt = p->pBt;
3098     sqlite3BtreeEnter(p);
3099 #ifndef SQLITE_OMIT_AUTOVACUUM
3100     if( pBt->autoVacuum ){
3101       rc = autoVacuumCommit(pBt);
3102       if( rc!=SQLITE_OK ){
3103         sqlite3BtreeLeave(p);
3104         return rc;
3105       }
3106     }
3107 #endif
3108     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3109     sqlite3BtreeLeave(p);
3110   }
3111   return rc;
3112 }
3113 
3114 /*
3115 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3116 ** at the conclusion of a transaction.
3117 */
btreeEndTransaction(Btree * p)3118 static void btreeEndTransaction(Btree *p){
3119   BtShared *pBt = p->pBt;
3120   assert( sqlite3BtreeHoldsMutex(p) );
3121 
3122   btreeClearHasContent(pBt);
3123   if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){
3124     /* If there are other active statements that belong to this database
3125     ** handle, downgrade to a read-only transaction. The other statements
3126     ** may still be reading from the database.  */
3127     downgradeAllSharedCacheTableLocks(p);
3128     p->inTrans = TRANS_READ;
3129   }else{
3130     /* If the handle had any kind of transaction open, decrement the
3131     ** transaction count of the shared btree. If the transaction count
3132     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3133     ** call below will unlock the pager.  */
3134     if( p->inTrans!=TRANS_NONE ){
3135       clearAllSharedCacheTableLocks(p);
3136       pBt->nTransaction--;
3137       if( 0==pBt->nTransaction ){
3138         pBt->inTransaction = TRANS_NONE;
3139       }
3140     }
3141 
3142     /* Set the current transaction state to TRANS_NONE and unlock the
3143     ** pager if this call closed the only read or write transaction.  */
3144     p->inTrans = TRANS_NONE;
3145     unlockBtreeIfUnused(pBt);
3146   }
3147 
3148   btreeIntegrity(p);
3149 }
3150 
3151 /*
3152 ** Commit the transaction currently in progress.
3153 **
3154 ** This routine implements the second phase of a 2-phase commit.  The
3155 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3156 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
3157 ** routine did all the work of writing information out to disk and flushing the
3158 ** contents so that they are written onto the disk platter.  All this
3159 ** routine has to do is delete or truncate or zero the header in the
3160 ** the rollback journal (which causes the transaction to commit) and
3161 ** drop locks.
3162 **
3163 ** Normally, if an error occurs while the pager layer is attempting to
3164 ** finalize the underlying journal file, this function returns an error and
3165 ** the upper layer will attempt a rollback. However, if the second argument
3166 ** is non-zero then this b-tree transaction is part of a multi-file
3167 ** transaction. In this case, the transaction has already been committed
3168 ** (by deleting a master journal file) and the caller will ignore this
3169 ** functions return code. So, even if an error occurs in the pager layer,
3170 ** reset the b-tree objects internal state to indicate that the write
3171 ** transaction has been closed. This is quite safe, as the pager will have
3172 ** transitioned to the error state.
3173 **
3174 ** This will release the write lock on the database file.  If there
3175 ** are no active cursors, it also releases the read lock.
3176 */
sqlite3BtreeCommitPhaseTwo(Btree * p,int bCleanup)3177 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3178 
3179   if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3180   sqlite3BtreeEnter(p);
3181   btreeIntegrity(p);
3182 
3183   /* If the handle has a write-transaction open, commit the shared-btrees
3184   ** transaction and set the shared state to TRANS_READ.
3185   */
3186   if( p->inTrans==TRANS_WRITE ){
3187     int rc;
3188     BtShared *pBt = p->pBt;
3189     assert( pBt->inTransaction==TRANS_WRITE );
3190     assert( pBt->nTransaction>0 );
3191     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3192     if( rc!=SQLITE_OK && bCleanup==0 ){
3193       sqlite3BtreeLeave(p);
3194       return rc;
3195     }
3196     pBt->inTransaction = TRANS_READ;
3197   }
3198 
3199   btreeEndTransaction(p);
3200   sqlite3BtreeLeave(p);
3201   return SQLITE_OK;
3202 }
3203 
3204 /*
3205 ** Do both phases of a commit.
3206 */
sqlite3BtreeCommit(Btree * p)3207 int sqlite3BtreeCommit(Btree *p){
3208   int rc;
3209   sqlite3BtreeEnter(p);
3210   rc = sqlite3BtreeCommitPhaseOne(p, 0);
3211   if( rc==SQLITE_OK ){
3212     rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3213   }
3214   sqlite3BtreeLeave(p);
3215   return rc;
3216 }
3217 
3218 #ifndef NDEBUG
3219 /*
3220 ** Return the number of write-cursors open on this handle. This is for use
3221 ** in assert() expressions, so it is only compiled if NDEBUG is not
3222 ** defined.
3223 **
3224 ** For the purposes of this routine, a write-cursor is any cursor that
3225 ** is capable of writing to the databse.  That means the cursor was
3226 ** originally opened for writing and the cursor has not be disabled
3227 ** by having its state changed to CURSOR_FAULT.
3228 */
countWriteCursors(BtShared * pBt)3229 static int countWriteCursors(BtShared *pBt){
3230   BtCursor *pCur;
3231   int r = 0;
3232   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3233     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
3234   }
3235   return r;
3236 }
3237 #endif
3238 
3239 /*
3240 ** This routine sets the state to CURSOR_FAULT and the error
3241 ** code to errCode for every cursor on BtShared that pBtree
3242 ** references.
3243 **
3244 ** Every cursor is tripped, including cursors that belong
3245 ** to other database connections that happen to be sharing
3246 ** the cache with pBtree.
3247 **
3248 ** This routine gets called when a rollback occurs.
3249 ** All cursors using the same cache must be tripped
3250 ** to prevent them from trying to use the btree after
3251 ** the rollback.  The rollback may have deleted tables
3252 ** or moved root pages, so it is not sufficient to
3253 ** save the state of the cursor.  The cursor must be
3254 ** invalidated.
3255 */
sqlite3BtreeTripAllCursors(Btree * pBtree,int errCode)3256 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3257   BtCursor *p;
3258   sqlite3BtreeEnter(pBtree);
3259   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3260     int i;
3261     sqlite3BtreeClearCursor(p);
3262     p->eState = CURSOR_FAULT;
3263     p->skipNext = errCode;
3264     for(i=0; i<=p->iPage; i++){
3265       releasePage(p->apPage[i]);
3266       p->apPage[i] = 0;
3267     }
3268   }
3269   sqlite3BtreeLeave(pBtree);
3270 }
3271 
3272 /*
3273 ** Rollback the transaction in progress.  All cursors will be
3274 ** invalided by this operation.  Any attempt to use a cursor
3275 ** that was open at the beginning of this operation will result
3276 ** in an error.
3277 **
3278 ** This will release the write lock on the database file.  If there
3279 ** are no active cursors, it also releases the read lock.
3280 */
sqlite3BtreeRollback(Btree * p)3281 int sqlite3BtreeRollback(Btree *p){
3282   int rc;
3283   BtShared *pBt = p->pBt;
3284   MemPage *pPage1;
3285 
3286   sqlite3BtreeEnter(p);
3287   rc = saveAllCursors(pBt, 0, 0);
3288 #ifndef SQLITE_OMIT_SHARED_CACHE
3289   if( rc!=SQLITE_OK ){
3290     /* This is a horrible situation. An IO or malloc() error occurred whilst
3291     ** trying to save cursor positions. If this is an automatic rollback (as
3292     ** the result of a constraint, malloc() failure or IO error) then
3293     ** the cache may be internally inconsistent (not contain valid trees) so
3294     ** we cannot simply return the error to the caller. Instead, abort
3295     ** all queries that may be using any of the cursors that failed to save.
3296     */
3297     sqlite3BtreeTripAllCursors(p, rc);
3298   }
3299 #endif
3300   btreeIntegrity(p);
3301 
3302   if( p->inTrans==TRANS_WRITE ){
3303     int rc2;
3304 
3305     assert( TRANS_WRITE==pBt->inTransaction );
3306     rc2 = sqlite3PagerRollback(pBt->pPager);
3307     if( rc2!=SQLITE_OK ){
3308       rc = rc2;
3309     }
3310 
3311     /* The rollback may have destroyed the pPage1->aData value.  So
3312     ** call btreeGetPage() on page 1 again to make
3313     ** sure pPage1->aData is set correctly. */
3314     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3315       int nPage = get4byte(28+(u8*)pPage1->aData);
3316       testcase( nPage==0 );
3317       if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3318       testcase( pBt->nPage!=nPage );
3319       pBt->nPage = nPage;
3320       releasePage(pPage1);
3321     }
3322     assert( countWriteCursors(pBt)==0 );
3323     pBt->inTransaction = TRANS_READ;
3324   }
3325 
3326   btreeEndTransaction(p);
3327   sqlite3BtreeLeave(p);
3328   return rc;
3329 }
3330 
3331 /*
3332 ** Start a statement subtransaction. The subtransaction can can be rolled
3333 ** back independently of the main transaction. You must start a transaction
3334 ** before starting a subtransaction. The subtransaction is ended automatically
3335 ** if the main transaction commits or rolls back.
3336 **
3337 ** Statement subtransactions are used around individual SQL statements
3338 ** that are contained within a BEGIN...COMMIT block.  If a constraint
3339 ** error occurs within the statement, the effect of that one statement
3340 ** can be rolled back without having to rollback the entire transaction.
3341 **
3342 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3343 ** value passed as the second parameter is the total number of savepoints,
3344 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3345 ** are no active savepoints and no other statement-transactions open,
3346 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3347 ** using the sqlite3BtreeSavepoint() function.
3348 */
sqlite3BtreeBeginStmt(Btree * p,int iStatement)3349 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3350   int rc;
3351   BtShared *pBt = p->pBt;
3352   sqlite3BtreeEnter(p);
3353   assert( p->inTrans==TRANS_WRITE );
3354   assert( pBt->readOnly==0 );
3355   assert( iStatement>0 );
3356   assert( iStatement>p->db->nSavepoint );
3357   assert( pBt->inTransaction==TRANS_WRITE );
3358   /* At the pager level, a statement transaction is a savepoint with
3359   ** an index greater than all savepoints created explicitly using
3360   ** SQL statements. It is illegal to open, release or rollback any
3361   ** such savepoints while the statement transaction savepoint is active.
3362   */
3363   rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3364   sqlite3BtreeLeave(p);
3365   return rc;
3366 }
3367 
3368 /*
3369 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3370 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3371 ** savepoint identified by parameter iSavepoint, depending on the value
3372 ** of op.
3373 **
3374 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3375 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3376 ** contents of the entire transaction are rolled back. This is different
3377 ** from a normal transaction rollback, as no locks are released and the
3378 ** transaction remains open.
3379 */
sqlite3BtreeSavepoint(Btree * p,int op,int iSavepoint)3380 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3381   int rc = SQLITE_OK;
3382   if( p && p->inTrans==TRANS_WRITE ){
3383     BtShared *pBt = p->pBt;
3384     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3385     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3386     sqlite3BtreeEnter(p);
3387     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3388     if( rc==SQLITE_OK ){
3389       if( iSavepoint<0 && pBt->initiallyEmpty ) pBt->nPage = 0;
3390       rc = newDatabase(pBt);
3391       pBt->nPage = get4byte(28 + pBt->pPage1->aData);
3392 
3393       /* The database size was written into the offset 28 of the header
3394       ** when the transaction started, so we know that the value at offset
3395       ** 28 is nonzero. */
3396       assert( pBt->nPage>0 );
3397     }
3398     sqlite3BtreeLeave(p);
3399   }
3400   return rc;
3401 }
3402 
3403 /*
3404 ** Create a new cursor for the BTree whose root is on the page
3405 ** iTable. If a read-only cursor is requested, it is assumed that
3406 ** the caller already has at least a read-only transaction open
3407 ** on the database already. If a write-cursor is requested, then
3408 ** the caller is assumed to have an open write transaction.
3409 **
3410 ** If wrFlag==0, then the cursor can only be used for reading.
3411 ** If wrFlag==1, then the cursor can be used for reading or for
3412 ** writing if other conditions for writing are also met.  These
3413 ** are the conditions that must be met in order for writing to
3414 ** be allowed:
3415 **
3416 ** 1:  The cursor must have been opened with wrFlag==1
3417 **
3418 ** 2:  Other database connections that share the same pager cache
3419 **     but which are not in the READ_UNCOMMITTED state may not have
3420 **     cursors open with wrFlag==0 on the same table.  Otherwise
3421 **     the changes made by this write cursor would be visible to
3422 **     the read cursors in the other database connection.
3423 **
3424 ** 3:  The database must be writable (not on read-only media)
3425 **
3426 ** 4:  There must be an active transaction.
3427 **
3428 ** No checking is done to make sure that page iTable really is the
3429 ** root page of a b-tree.  If it is not, then the cursor acquired
3430 ** will not work correctly.
3431 **
3432 ** It is assumed that the sqlite3BtreeCursorZero() has been called
3433 ** on pCur to initialize the memory space prior to invoking this routine.
3434 */
btreeCursor(Btree * p,int iTable,int wrFlag,struct KeyInfo * pKeyInfo,BtCursor * pCur)3435 static int btreeCursor(
3436   Btree *p,                              /* The btree */
3437   int iTable,                            /* Root page of table to open */
3438   int wrFlag,                            /* 1 to write. 0 read-only */
3439   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
3440   BtCursor *pCur                         /* Space for new cursor */
3441 ){
3442   BtShared *pBt = p->pBt;                /* Shared b-tree handle */
3443 
3444   assert( sqlite3BtreeHoldsMutex(p) );
3445   assert( wrFlag==0 || wrFlag==1 );
3446 
3447   /* The following assert statements verify that if this is a sharable
3448   ** b-tree database, the connection is holding the required table locks,
3449   ** and that no other connection has any open cursor that conflicts with
3450   ** this lock.  */
3451   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
3452   assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3453 
3454   /* Assert that the caller has opened the required transaction. */
3455   assert( p->inTrans>TRANS_NONE );
3456   assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3457   assert( pBt->pPage1 && pBt->pPage1->aData );
3458 
3459   if( NEVER(wrFlag && pBt->readOnly) ){
3460     return SQLITE_READONLY;
3461   }
3462   if( iTable==1 && btreePagecount(pBt)==0 ){
3463     return SQLITE_EMPTY;
3464   }
3465 
3466   /* Now that no other errors can occur, finish filling in the BtCursor
3467   ** variables and link the cursor into the BtShared list.  */
3468   pCur->pgnoRoot = (Pgno)iTable;
3469   pCur->iPage = -1;
3470   pCur->pKeyInfo = pKeyInfo;
3471   pCur->pBtree = p;
3472   pCur->pBt = pBt;
3473   pCur->wrFlag = (u8)wrFlag;
3474   pCur->pNext = pBt->pCursor;
3475   if( pCur->pNext ){
3476     pCur->pNext->pPrev = pCur;
3477   }
3478   pBt->pCursor = pCur;
3479   pCur->eState = CURSOR_INVALID;
3480   pCur->cachedRowid = 0;
3481   return SQLITE_OK;
3482 }
/*
** Public wrapper around btreeCursor(): the same arguments are passed
** through unchanged, with the call bracketed by sqlite3BtreeEnter() and
** sqlite3BtreeLeave() on p.  The result of btreeCursor() is returned.
*/
int sqlite3BtreeCursor(
  Btree *p,                                   /* The btree */
  int iTable,                                 /* Root page of table to open */
  int wrFlag,                                 /* 1 to write. 0 read-only */
  struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
  BtCursor *pCur                              /* Write new cursor here */
){
  int rcOpen;

  sqlite3BtreeEnter(p);
  rcOpen = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  sqlite3BtreeLeave(p);

  return rcOpen;
}
3496 
3497 /*
3498 ** Return the size of a BtCursor object in bytes.
3499 **
3500 ** This interfaces is needed so that users of cursors can preallocate
3501 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
3502 ** to users so they cannot do the sizeof() themselves - they must call
3503 ** this routine.
3504 */
sqlite3BtreeCursorSize(void)3505 int sqlite3BtreeCursorSize(void){
3506   return ROUND8(sizeof(BtCursor));
3507 }
3508 
3509 /*
3510 ** Initialize memory that will be converted into a BtCursor object.
3511 **
3512 ** The simple approach here would be to memset() the entire object
3513 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
3514 ** do not need to be zeroed and they are large, so we can save a lot
3515 ** of run-time by skipping the initialization of those elements.
3516 */
sqlite3BtreeCursorZero(BtCursor * p)3517 void sqlite3BtreeCursorZero(BtCursor *p){
3518   memset(p, 0, offsetof(BtCursor, iPage));
3519 }
3520 
3521 /*
3522 ** Set the cached rowid value of every cursor in the same database file
3523 ** as pCur and having the same root page number as pCur.  The value is
3524 ** set to iRowid.
3525 **
3526 ** Only positive rowid values are considered valid for this cache.
3527 ** The cache is initialized to zero, indicating an invalid cache.
3528 ** A btree will work fine with zero or negative rowids.  We just cannot
3529 ** cache zero or negative rowids, which means tables that use zero or
3530 ** negative rowids might run a little slower.  But in practice, zero
3531 ** or negative rowids are very uncommon so this should not be a problem.
3532 */
sqlite3BtreeSetCachedRowid(BtCursor * pCur,sqlite3_int64 iRowid)3533 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3534   BtCursor *p;
3535   for(p=pCur->pBt->pCursor; p; p=p->pNext){
3536     if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3537   }
3538   assert( pCur->cachedRowid==iRowid );
3539 }
3540 
3541 /*
3542 ** Return the cached rowid for the given cursor.  A negative or zero
3543 ** return value indicates that the rowid cache is invalid and should be
3544 ** ignored.  If the rowid cache has never before been set, then a
3545 ** zero is returned.
3546 */
sqlite3BtreeGetCachedRowid(BtCursor * pCur)3547 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3548   return pCur->cachedRowid;
3549 }
3550 
3551 /*
3552 ** Close a cursor.  The read lock on the database file is released
3553 ** when the last cursor is closed.
3554 */
sqlite3BtreeCloseCursor(BtCursor * pCur)3555 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3556   Btree *pBtree = pCur->pBtree;
3557   if( pBtree ){
3558     int i;
3559     BtShared *pBt = pCur->pBt;
3560     sqlite3BtreeEnter(pBtree);
3561     sqlite3BtreeClearCursor(pCur);
3562     if( pCur->pPrev ){
3563       pCur->pPrev->pNext = pCur->pNext;
3564     }else{
3565       pBt->pCursor = pCur->pNext;
3566     }
3567     if( pCur->pNext ){
3568       pCur->pNext->pPrev = pCur->pPrev;
3569     }
3570     for(i=0; i<=pCur->iPage; i++){
3571       releasePage(pCur->apPage[i]);
3572     }
3573     unlockBtreeIfUnused(pBt);
3574     invalidateOverflowCache(pCur);
3575     /* sqlite3_free(pCur); */
3576     sqlite3BtreeLeave(pBtree);
3577   }
3578   return SQLITE_OK;
3579 }
3580 
3581 /*
3582 ** Make sure the BtCursor* given in the argument has a valid
3583 ** BtCursor.info structure.  If it is not already valid, call
3584 ** btreeParseCell() to fill it in.
3585 **
3586 ** BtCursor.info is a cache of the information in the current cell.
3587 ** Using this cache reduces the number of calls to btreeParseCell().
3588 **
3589 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
3590 ** compiler to crash when getCellInfo() is implemented as a macro.
3591 ** But there is a measureable speed advantage to using the macro on gcc
3592 ** (when less compiler optimizations like -Os or -O0 are used and the
3593 ** compiler is not doing agressive inlining.)  So we use a real function
3594 ** for MSVC and a macro for everything else.  Ticket #2457.
3595 */
3596 #ifndef NDEBUG
assertCellInfo(BtCursor * pCur)3597   static void assertCellInfo(BtCursor *pCur){
3598     CellInfo info;
3599     int iPage = pCur->iPage;
3600     memset(&info, 0, sizeof(info));
3601     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3602     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
3603   }
3604 #else
3605   #define assertCellInfo(x)
3606 #endif
3607 #ifdef _MSC_VER
3608   /* Use a real function in MSVC to work around bugs in that compiler. */
getCellInfo(BtCursor * pCur)3609   static void getCellInfo(BtCursor *pCur){
3610     if( pCur->info.nSize==0 ){
3611       int iPage = pCur->iPage;
3612       btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
3613       pCur->validNKey = 1;
3614     }else{
3615       assertCellInfo(pCur);
3616     }
3617   }
3618 #else /* if not _MSC_VER */
3619   /* Use a macro in all other compilers so that the function is inlined */
3620 #define getCellInfo(pCur)                                                      \
3621   if( pCur->info.nSize==0 ){                                                   \
3622     int iPage = pCur->iPage;                                                   \
3623     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
3624     pCur->validNKey = 1;                                                       \
3625   }else{                                                                       \
3626     assertCellInfo(pCur);                                                      \
3627   }
3628 #endif /* _MSC_VER */
3629 
3630 #ifndef NDEBUG  /* The next routine used only within assert() statements */
3631 /*
3632 ** Return true if the given BtCursor is valid.  A valid cursor is one
3633 ** that is currently pointing to a row in a (non-empty) table.
3634 ** This is a verification routine is used only within assert() statements.
3635 */
sqlite3BtreeCursorIsValid(BtCursor * pCur)3636 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3637   return pCur && pCur->eState==CURSOR_VALID;
3638 }
3639 #endif /* NDEBUG */
3640 
3641 /*
3642 ** Set *pSize to the size of the buffer needed to hold the value of
3643 ** the key for the current entry.  If the cursor is not pointing
3644 ** to a valid entry, *pSize is set to 0.
3645 **
3646 ** For a table with the INTKEY flag set, this routine returns the key
3647 ** itself, not the number of bytes in the key.
3648 **
3649 ** The caller must position the cursor prior to invoking this routine.
3650 **
3651 ** This routine cannot fail.  It always returns SQLITE_OK.
3652 */
sqlite3BtreeKeySize(BtCursor * pCur,i64 * pSize)3653 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3654   assert( cursorHoldsMutex(pCur) );
3655   assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3656   if( pCur->eState!=CURSOR_VALID ){
3657     *pSize = 0;
3658   }else{
3659     getCellInfo(pCur);
3660     *pSize = pCur->info.nKey;
3661   }
3662   return SQLITE_OK;
3663 }
3664 
3665 /*
3666 ** Set *pSize to the number of bytes of data in the entry the
3667 ** cursor currently points to.
3668 **
3669 ** The caller must guarantee that the cursor is pointing to a non-NULL
3670 ** valid entry.  In other words, the calling procedure must guarantee
3671 ** that the cursor has Cursor.eState==CURSOR_VALID.
3672 **
3673 ** Failure is not possible.  This function always returns SQLITE_OK.
3674 ** It might just as well be a procedure (returning void) but we continue
3675 ** to return an integer result code for historical reasons.
3676 */
sqlite3BtreeDataSize(BtCursor * pCur,u32 * pSize)3677 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3678   assert( cursorHoldsMutex(pCur) );
3679   assert( pCur->eState==CURSOR_VALID );
3680   getCellInfo(pCur);
3681   *pSize = pCur->info.nData;
3682   return SQLITE_OK;
3683 }
3684 
3685 /*
3686 ** Given the page number of an overflow page in the database (parameter
3687 ** ovfl), this function finds the page number of the next page in the
3688 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3689 ** pointer-map data instead of reading the content of page ovfl to do so.
3690 **
3691 ** If an error occurs an SQLite error code is returned. Otherwise:
3692 **
3693 ** The page number of the next overflow page in the linked list is
3694 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3695 ** list, *pPgnoNext is set to zero.
3696 **
3697 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
3698 ** to page number pOvfl was obtained, then *ppPage is set to point to that
3699 ** reference. It is the responsibility of the caller to call releasePage()
3700 ** on *ppPage to free the reference. In no reference was obtained (because
3701 ** the pointer-map was used to obtain the value for *pPgnoNext), then
3702 ** *ppPage is set to zero.
3703 */
getOverflowPage(BtShared * pBt,Pgno ovfl,MemPage ** ppPage,Pgno * pPgnoNext)3704 static int getOverflowPage(
3705   BtShared *pBt,               /* The database file */
3706   Pgno ovfl,                   /* Current overflow page number */
3707   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
3708   Pgno *pPgnoNext              /* OUT: Next overflow page number */
3709 ){
3710   Pgno next = 0;
3711   MemPage *pPage = 0;
3712   int rc = SQLITE_OK;
3713 
3714   assert( sqlite3_mutex_held(pBt->mutex) );
3715   assert(pPgnoNext);
3716 
3717 #ifndef SQLITE_OMIT_AUTOVACUUM
3718   /* Try to find the next page in the overflow list using the
3719   ** autovacuum pointer-map pages. Guess that the next page in
3720   ** the overflow list is page number (ovfl+1). If that guess turns
3721   ** out to be wrong, fall back to loading the data of page
3722   ** number ovfl to determine the next page number.
3723   */
3724   if( pBt->autoVacuum ){
3725     Pgno pgno;
3726     Pgno iGuess = ovfl+1;
3727     u8 eType;
3728 
3729     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3730       iGuess++;
3731     }
3732 
3733     if( iGuess<=btreePagecount(pBt) ){
3734       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3735       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3736         next = iGuess;
3737         rc = SQLITE_DONE;
3738       }
3739     }
3740   }
3741 #endif
3742 
3743   assert( next==0 || rc==SQLITE_DONE );
3744   if( rc==SQLITE_OK ){
3745     rc = btreeGetPage(pBt, ovfl, &pPage, 0);
3746     assert( rc==SQLITE_OK || pPage==0 );
3747     if( rc==SQLITE_OK ){
3748       next = get4byte(pPage->aData);
3749     }
3750   }
3751 
3752   *pPgnoNext = next;
3753   if( ppPage ){
3754     *ppPage = pPage;
3755   }else{
3756     releasePage(pPage);
3757   }
3758   return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3759 }
3760 
3761 /*
3762 ** Copy data from a buffer to a page, or from a page to a buffer.
3763 **
3764 ** pPayload is a pointer to data stored on database page pDbPage.
3765 ** If argument eOp is false, then nByte bytes of data are copied
3766 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3767 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3768 ** of data are copied from the buffer pBuf to pPayload.
3769 **
3770 ** SQLITE_OK is returned on success, otherwise an error code.
3771 */
copyPayload(void * pPayload,void * pBuf,int nByte,int eOp,DbPage * pDbPage)3772 static int copyPayload(
3773   void *pPayload,           /* Pointer to page data */
3774   void *pBuf,               /* Pointer to buffer */
3775   int nByte,                /* Number of bytes to copy */
3776   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
3777   DbPage *pDbPage           /* Page containing pPayload */
3778 ){
3779   if( eOp ){
3780     /* Copy data from buffer to page (a write operation) */
3781     int rc = sqlite3PagerWrite(pDbPage);
3782     if( rc!=SQLITE_OK ){
3783       return rc;
3784     }
3785     memcpy(pPayload, pBuf, nByte);
3786   }else{
3787     /* Copy data from page to buffer (a read operation) */
3788     memcpy(pBuf, pPayload, nByte);
3789   }
3790   return SQLITE_OK;
3791 }
3792 
3793 /*
3794 ** This function is used to read or overwrite payload information
3795 ** for the entry that the pCur cursor is pointing to. If the eOp
3796 ** parameter is 0, this is a read operation (data copied into
3797 ** buffer pBuf). If it is non-zero, a write (data copied from
3798 ** buffer pBuf).
3799 **
3800 ** A total of "amt" bytes are read or written beginning at "offset".
3801 ** Data is read to or from the buffer pBuf.
3802 **
3803 ** The content being read or written might appear on the main page
3804 ** or be scattered out on multiple overflow pages.
3805 **
3806 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3807 ** cursor entry uses one or more overflow pages, this function
3808 ** allocates space for and lazily popluates the overflow page-list
3809 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3810 ** cache to make seeking to the supplied offset more efficient.
3811 **
3812 ** Once an overflow page-list cache has been allocated, it may be
3813 ** invalidated if some other cursor writes to the same table, or if
3814 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3815 ** mode, the following events may invalidate an overflow page-list cache.
3816 **
3817 **   * An incremental vacuum,
3818 **   * A commit in auto_vacuum="full" mode,
3819 **   * Creating a table (may require moving an overflow page).
3820 */
accessPayload(BtCursor * pCur,u32 offset,u32 amt,unsigned char * pBuf,int eOp)3821 static int accessPayload(
3822   BtCursor *pCur,      /* Cursor pointing to entry to read from */
3823   u32 offset,          /* Begin reading this far into payload */
3824   u32 amt,             /* Read this many bytes */
3825   unsigned char *pBuf, /* Write the bytes into this buffer */
3826   int eOp              /* zero to read. non-zero to write. */
3827 ){
3828   unsigned char *aPayload;
3829   int rc = SQLITE_OK;
3830   u32 nKey;
3831   int iIdx = 0;
3832   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3833   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
3834 
3835   assert( pPage );
3836   assert( pCur->eState==CURSOR_VALID );
3837   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3838   assert( cursorHoldsMutex(pCur) );
3839 
3840   getCellInfo(pCur);
3841   aPayload = pCur->info.pCell + pCur->info.nHeader;
3842   nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
3843 
3844   if( NEVER(offset+amt > nKey+pCur->info.nData)
3845    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3846   ){
3847     /* Trying to read or write past the end of the data is an error */
3848     return SQLITE_CORRUPT_BKPT;
3849   }
3850 
3851   /* Check if data must be read/written to/from the btree page itself. */
3852   if( offset<pCur->info.nLocal ){
3853     int a = amt;
3854     if( a+offset>pCur->info.nLocal ){
3855       a = pCur->info.nLocal - offset;
3856     }
3857     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3858     offset = 0;
3859     pBuf += a;
3860     amt -= a;
3861   }else{
3862     offset -= pCur->info.nLocal;
3863   }
3864 
3865   if( rc==SQLITE_OK && amt>0 ){
3866     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
3867     Pgno nextPage;
3868 
3869     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3870 
3871 #ifndef SQLITE_OMIT_INCRBLOB
3872     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3873     ** has not been allocated, allocate it now. The array is sized at
3874     ** one entry for each overflow page in the overflow chain. The
3875     ** page number of the first overflow page is stored in aOverflow[0],
3876     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3877     ** (the cache is lazily populated).
3878     */
3879     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3880       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3881       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3882       /* nOvfl is always positive.  If it were zero, fetchPayload would have
3883       ** been used instead of this routine. */
3884       if( ALWAYS(nOvfl) && !pCur->aOverflow ){
3885         rc = SQLITE_NOMEM;
3886       }
3887     }
3888 
3889     /* If the overflow page-list cache has been allocated and the
3890     ** entry for the first required overflow page is valid, skip
3891     ** directly to it.
3892     */
3893     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3894       iIdx = (offset/ovflSize);
3895       nextPage = pCur->aOverflow[iIdx];
3896       offset = (offset%ovflSize);
3897     }
3898 #endif
3899 
3900     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3901 
3902 #ifndef SQLITE_OMIT_INCRBLOB
3903       /* If required, populate the overflow page-list cache. */
3904       if( pCur->aOverflow ){
3905         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3906         pCur->aOverflow[iIdx] = nextPage;
3907       }
3908 #endif
3909 
3910       if( offset>=ovflSize ){
3911         /* The only reason to read this page is to obtain the page
3912         ** number for the next page in the overflow chain. The page
3913         ** data is not required. So first try to lookup the overflow
3914         ** page-list cache, if any, then fall back to the getOverflowPage()
3915         ** function.
3916         */
3917 #ifndef SQLITE_OMIT_INCRBLOB
3918         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3919           nextPage = pCur->aOverflow[iIdx+1];
3920         } else
3921 #endif
3922           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3923         offset -= ovflSize;
3924       }else{
3925         /* Need to read this page properly. It contains some of the
3926         ** range of data that is being read (eOp==0) or written (eOp!=0).
3927         */
3928         DbPage *pDbPage;
3929         int a = amt;
3930         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3931         if( rc==SQLITE_OK ){
3932           aPayload = sqlite3PagerGetData(pDbPage);
3933           nextPage = get4byte(aPayload);
3934           if( a + offset > ovflSize ){
3935             a = ovflSize - offset;
3936           }
3937           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3938           sqlite3PagerUnref(pDbPage);
3939           offset = 0;
3940           amt -= a;
3941           pBuf += a;
3942         }
3943       }
3944     }
3945   }
3946 
3947   if( rc==SQLITE_OK && amt>0 ){
3948     return SQLITE_CORRUPT_BKPT;
3949   }
3950   return rc;
3951 }
3952 
3953 /*
3954 ** Read part of the key associated with cursor pCur.  Exactly
3955 ** "amt" bytes will be transfered into pBuf[].  The transfer
3956 ** begins at "offset".
3957 **
3958 ** The caller must ensure that pCur is pointing to a valid row
3959 ** in the table.
3960 **
3961 ** Return SQLITE_OK on success or an error code if anything goes
3962 ** wrong.  An error is returned if "offset+amt" is larger than
3963 ** the available payload.
3964 */
sqlite3BtreeKey(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)3965 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3966   assert( cursorHoldsMutex(pCur) );
3967   assert( pCur->eState==CURSOR_VALID );
3968   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3969   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3970   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
3971 }
3972 
3973 /*
3974 ** Read part of the data associated with cursor pCur.  Exactly
3975 ** "amt" bytes will be transfered into pBuf[].  The transfer
3976 ** begins at "offset".
3977 **
3978 ** Return SQLITE_OK on success or an error code if anything goes
3979 ** wrong.  An error is returned if "offset+amt" is larger than
3980 ** the available payload.
3981 */
sqlite3BtreeData(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)3982 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3983   int rc;
3984 
3985 #ifndef SQLITE_OMIT_INCRBLOB
3986   if ( pCur->eState==CURSOR_INVALID ){
3987     return SQLITE_ABORT;
3988   }
3989 #endif
3990 
3991   assert( cursorHoldsMutex(pCur) );
3992   rc = restoreCursorPosition(pCur);
3993   if( rc==SQLITE_OK ){
3994     assert( pCur->eState==CURSOR_VALID );
3995     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3996     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3997     rc = accessPayload(pCur, offset, amt, pBuf, 0);
3998   }
3999   return rc;
4000 }
4001 
4002 /*
4003 ** Return a pointer to payload information from the entry that the
4004 ** pCur cursor is pointing to.  The pointer is to the beginning of
4005 ** the key if skipKey==0 and it points to the beginning of data if
4006 ** skipKey==1.  The number of bytes of available key/data is written
4007 ** into *pAmt.  If *pAmt==0, then the value returned will not be
4008 ** a valid pointer.
4009 **
4010 ** This routine is an optimization.  It is common for the entire key
4011 ** and data to fit on the local page and for there to be no overflow
4012 ** pages.  When that is so, this routine can be used to access the
4013 ** key and data without making a copy.  If the key and/or data spills
4014 ** onto overflow pages, then accessPayload() must be used to reassemble
4015 ** the key/data and copy it into a preallocated buffer.
4016 **
4017 ** The pointer returned by this routine looks directly into the cached
4018 ** page of the database.  The data might change or move the next time
4019 ** any btree routine is called.
4020 */
fetchPayload(BtCursor * pCur,int * pAmt,int skipKey)4021 static const unsigned char *fetchPayload(
4022   BtCursor *pCur,      /* Cursor pointing to entry to read from */
4023   int *pAmt,           /* Write the number of available bytes here */
4024   int skipKey          /* read beginning at data if this is true */
4025 ){
4026   unsigned char *aPayload;
4027   MemPage *pPage;
4028   u32 nKey;
4029   u32 nLocal;
4030 
4031   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4032   assert( pCur->eState==CURSOR_VALID );
4033   assert( cursorHoldsMutex(pCur) );
4034   pPage = pCur->apPage[pCur->iPage];
4035   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4036   if( NEVER(pCur->info.nSize==0) ){
4037     btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
4038                    &pCur->info);
4039   }
4040   aPayload = pCur->info.pCell;
4041   aPayload += pCur->info.nHeader;
4042   if( pPage->intKey ){
4043     nKey = 0;
4044   }else{
4045     nKey = (int)pCur->info.nKey;
4046   }
4047   if( skipKey ){
4048     aPayload += nKey;
4049     nLocal = pCur->info.nLocal - nKey;
4050   }else{
4051     nLocal = pCur->info.nLocal;
4052     assert( nLocal<=nKey );
4053   }
4054   *pAmt = nLocal;
4055   return aPayload;
4056 }
4057 
4058 
4059 /*
4060 ** For the entry that cursor pCur is point to, return as
4061 ** many bytes of the key or data as are available on the local
4062 ** b-tree page.  Write the number of available bytes into *pAmt.
4063 **
4064 ** The pointer returned is ephemeral.  The key/data may move
4065 ** or be destroyed on the next call to any Btree routine,
4066 ** including calls from other threads against the same cache.
4067 ** Hence, a mutex on the BtShared should be held prior to calling
4068 ** this routine.
4069 **
4070 ** These routines is used to get quick access to key and data
4071 ** in the common case where no overflow pages are used.
4072 */
sqlite3BtreeKeyFetch(BtCursor * pCur,int * pAmt)4073 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
4074   const void *p = 0;
4075   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4076   assert( cursorHoldsMutex(pCur) );
4077   if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4078     p = (const void*)fetchPayload(pCur, pAmt, 0);
4079   }
4080   return p;
4081 }
/*
** Counterpart of sqlite3BtreeKeyFetch() that asks fetchPayload() to
** skip over the key, so the returned pointer addresses the locally
** stored data.  *pAmt receives the number of bytes available.  NULL is
** returned if the cursor is not in the CURSOR_VALID state.
*/
const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
  assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  assert( cursorHoldsMutex(pCur) );
  if( !ALWAYS(pCur->eState==CURSOR_VALID) ){
    return 0;
  }
  return (const void*)fetchPayload(pCur, pAmt, 1);
}
4091 
4092 
4093 /*
4094 ** Move the cursor down to a new child page.  The newPgno argument is the
4095 ** page number of the child page to move to.
4096 **
4097 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4098 ** the new child page does not match the flags field of the parent (i.e.
4099 ** if an intkey page appears to be the parent of a non-intkey page, or
4100 ** vice-versa).
4101 */
moveToChild(BtCursor * pCur,u32 newPgno)4102 static int moveToChild(BtCursor *pCur, u32 newPgno){
4103   int rc;
4104   int i = pCur->iPage;
4105   MemPage *pNewPage;
4106   BtShared *pBt = pCur->pBt;
4107 
4108   assert( cursorHoldsMutex(pCur) );
4109   assert( pCur->eState==CURSOR_VALID );
4110   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4111   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4112     return SQLITE_CORRUPT_BKPT;
4113   }
4114   rc = getAndInitPage(pBt, newPgno, &pNewPage);
4115   if( rc ) return rc;
4116   pCur->apPage[i+1] = pNewPage;
4117   pCur->aiIdx[i+1] = 0;
4118   pCur->iPage++;
4119 
4120   pCur->info.nSize = 0;
4121   pCur->validNKey = 0;
4122   if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
4123     return SQLITE_CORRUPT_BKPT;
4124   }
4125   return SQLITE_OK;
4126 }
4127 
4128 #ifndef NDEBUG
4129 /*
4130 ** Page pParent is an internal (non-leaf) tree page. This function
4131 ** asserts that page number iChild is the left-child if the iIdx'th
4132 ** cell in page pParent. Or, if iIdx is equal to the total number of
4133 ** cells in pParent, that page number iChild is the right-child of
4134 ** the page.
4135 */
assertParentIndex(MemPage * pParent,int iIdx,Pgno iChild)4136 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
4137   assert( iIdx<=pParent->nCell );
4138   if( iIdx==pParent->nCell ){
4139     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
4140   }else{
4141     assert( get4byte(findCell(pParent, iIdx))==iChild );
4142   }
4143 }
4144 #else
4145 #  define assertParentIndex(x,y,z)
4146 #endif
4147 
4148 /*
4149 ** Move the cursor up to the parent page.
4150 **
4151 ** pCur->idx is set to the cell index that contains the pointer
4152 ** to the page we are coming from.  If we are coming from the
4153 ** right-most child page then pCur->idx is set to one more than
4154 ** the largest cell index.
4155 */
moveToParent(BtCursor * pCur)4156 static void moveToParent(BtCursor *pCur){
4157   assert( cursorHoldsMutex(pCur) );
4158   assert( pCur->eState==CURSOR_VALID );
4159   assert( pCur->iPage>0 );
4160   assert( pCur->apPage[pCur->iPage] );
4161   assertParentIndex(
4162     pCur->apPage[pCur->iPage-1],
4163     pCur->aiIdx[pCur->iPage-1],
4164     pCur->apPage[pCur->iPage]->pgno
4165   );
4166   releasePage(pCur->apPage[pCur->iPage]);
4167   pCur->iPage--;
4168   pCur->info.nSize = 0;
4169   pCur->validNKey = 0;
4170 }
4171 
4172 /*
4173 ** Move the cursor to point to the root page of its b-tree structure.
4174 **
4175 ** If the table has a virtual root page, then the cursor is moved to point
4176 ** to the virtual root page instead of the actual root page. A table has a
4177 ** virtual root page when the actual root page contains no cells and a
4178 ** single child page. This can only happen with the table rooted at page 1.
4179 **
4180 ** If the b-tree structure is empty, the cursor state is set to
4181 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4182 ** cell located on the root (or virtual root) page and the cursor state
4183 ** is set to CURSOR_VALID.
4184 **
4185 ** If this function returns successfully, it may be assumed that the
4186 ** page-header flags indicate that the [virtual] root-page is the expected
4187 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4188 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4189 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4190 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4191 ** b-tree).
4192 */
moveToRoot(BtCursor * pCur)4193 static int moveToRoot(BtCursor *pCur){
4194   MemPage *pRoot;
4195   int rc = SQLITE_OK;
4196   Btree *p = pCur->pBtree;
4197   BtShared *pBt = p->pBt;
4198 
4199   assert( cursorHoldsMutex(pCur) );
4200   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4201   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
4202   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
4203   if( pCur->eState>=CURSOR_REQUIRESEEK ){
4204     if( pCur->eState==CURSOR_FAULT ){
4205       assert( pCur->skipNext!=SQLITE_OK );
4206       return pCur->skipNext;
4207     }
4208     sqlite3BtreeClearCursor(pCur);
4209   }
4210 
4211   if( pCur->iPage>=0 ){
4212     int i;
4213     for(i=1; i<=pCur->iPage; i++){
4214       releasePage(pCur->apPage[i]);
4215     }
4216     pCur->iPage = 0;
4217   }else{
4218     rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
4219     if( rc!=SQLITE_OK ){
4220       pCur->eState = CURSOR_INVALID;
4221       return rc;
4222     }
4223     pCur->iPage = 0;
4224 
4225     /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4226     ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4227     ** NULL, the caller expects a table b-tree. If this is not the case,
4228     ** return an SQLITE_CORRUPT error.  */
4229     assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
4230     if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
4231       return SQLITE_CORRUPT_BKPT;
4232     }
4233   }
4234 
4235   /* Assert that the root page is of the correct type. This must be the
4236   ** case as the call to this function that loaded the root-page (either
4237   ** this call or a previous invocation) would have detected corruption
4238   ** if the assumption were not true, and it is not possible for the flags
4239   ** byte to have been modified while this cursor is holding a reference
4240   ** to the page.  */
4241   pRoot = pCur->apPage[0];
4242   assert( pRoot->pgno==pCur->pgnoRoot );
4243   assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
4244 
4245   pCur->aiIdx[0] = 0;
4246   pCur->info.nSize = 0;
4247   pCur->atLast = 0;
4248   pCur->validNKey = 0;
4249 
4250   if( pRoot->nCell==0 && !pRoot->leaf ){
4251     Pgno subpage;
4252     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4253     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4254     pCur->eState = CURSOR_VALID;
4255     rc = moveToChild(pCur, subpage);
4256   }else{
4257     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
4258   }
4259   return rc;
4260 }
4261 
4262 /*
4263 ** Move the cursor down to the left-most leaf entry beneath the
4264 ** entry to which it is currently pointing.
4265 **
4266 ** The left-most leaf is the one with the smallest key - the first
4267 ** in ascending order.
4268 */
moveToLeftmost(BtCursor * pCur)4269 static int moveToLeftmost(BtCursor *pCur){
4270   Pgno pgno;
4271   int rc = SQLITE_OK;
4272   MemPage *pPage;
4273 
4274   assert( cursorHoldsMutex(pCur) );
4275   assert( pCur->eState==CURSOR_VALID );
4276   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4277     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4278     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4279     rc = moveToChild(pCur, pgno);
4280   }
4281   return rc;
4282 }
4283 
4284 /*
4285 ** Move the cursor down to the right-most leaf entry beneath the
4286 ** page to which it is currently pointing.  Notice the difference
4287 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
4288 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4289 ** finds the right-most entry beneath the *page*.
4290 **
4291 ** The right-most entry is the one with the largest key - the last
4292 ** key in ascending order.
4293 */
moveToRightmost(BtCursor * pCur)4294 static int moveToRightmost(BtCursor *pCur){
4295   Pgno pgno;
4296   int rc = SQLITE_OK;
4297   MemPage *pPage = 0;
4298 
4299   assert( cursorHoldsMutex(pCur) );
4300   assert( pCur->eState==CURSOR_VALID );
4301   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4302     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4303     pCur->aiIdx[pCur->iPage] = pPage->nCell;
4304     rc = moveToChild(pCur, pgno);
4305   }
4306   if( rc==SQLITE_OK ){
4307     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4308     pCur->info.nSize = 0;
4309     pCur->validNKey = 0;
4310   }
4311   return rc;
4312 }
4313 
4314 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
4315 ** on success.  Set *pRes to 0 if the cursor actually points to something
4316 ** or set *pRes to 1 if the table is empty.
4317 */
sqlite3BtreeFirst(BtCursor * pCur,int * pRes)4318 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4319   int rc;
4320 
4321   assert( cursorHoldsMutex(pCur) );
4322   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4323   rc = moveToRoot(pCur);
4324   if( rc==SQLITE_OK ){
4325     if( pCur->eState==CURSOR_INVALID ){
4326       assert( pCur->apPage[pCur->iPage]->nCell==0 );
4327       *pRes = 1;
4328     }else{
4329       assert( pCur->apPage[pCur->iPage]->nCell>0 );
4330       *pRes = 0;
4331       rc = moveToLeftmost(pCur);
4332     }
4333   }
4334   return rc;
4335 }
4336 
4337 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
4338 ** on success.  Set *pRes to 0 if the cursor actually points to something
4339 ** or set *pRes to 1 if the table is empty.
4340 */
sqlite3BtreeLast(BtCursor * pCur,int * pRes)4341 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4342   int rc;
4343 
4344   assert( cursorHoldsMutex(pCur) );
4345   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4346 
4347   /* If the cursor already points to the last entry, this is a no-op. */
4348   if( CURSOR_VALID==pCur->eState && pCur->atLast ){
4349 #ifdef SQLITE_DEBUG
4350     /* This block serves to assert() that the cursor really does point
4351     ** to the last entry in the b-tree. */
4352     int ii;
4353     for(ii=0; ii<pCur->iPage; ii++){
4354       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4355     }
4356     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4357     assert( pCur->apPage[pCur->iPage]->leaf );
4358 #endif
4359     return SQLITE_OK;
4360   }
4361 
4362   rc = moveToRoot(pCur);
4363   if( rc==SQLITE_OK ){
4364     if( CURSOR_INVALID==pCur->eState ){
4365       assert( pCur->apPage[pCur->iPage]->nCell==0 );
4366       *pRes = 1;
4367     }else{
4368       assert( pCur->eState==CURSOR_VALID );
4369       *pRes = 0;
4370       rc = moveToRightmost(pCur);
4371       pCur->atLast = rc==SQLITE_OK ?1:0;
4372     }
4373   }
4374   return rc;
4375 }
4376 
4377 /* Move the cursor so that it points to an entry near the key
4378 ** specified by pIdxKey or intKey.   Return a success code.
4379 **
4380 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
4381 ** must be NULL.  For index tables, pIdxKey is used and intKey
4382 ** is ignored.
4383 **
4384 ** If an exact match is not found, then the cursor is always
4385 ** left pointing at a leaf page which would hold the entry if it
4386 ** were present.  The cursor might point to an entry that comes
4387 ** before or after the key.
4388 **
4389 ** An integer is written into *pRes which is the result of
4390 ** comparing the key with the entry to which the cursor is
4391 ** pointing.  The meaning of the integer written into
4392 ** *pRes is as follows:
4393 **
4394 **     *pRes<0      The cursor is left pointing at an entry that
4395 **                  is smaller than intKey/pIdxKey or if the table is empty
4396 **                  and the cursor is therefore left point to nothing.
4397 **
4398 **     *pRes==0     The cursor is left pointing at an entry that
4399 **                  exactly matches intKey/pIdxKey.
4400 **
4401 **     *pRes>0      The cursor is left pointing at an entry that
4402 **                  is larger than intKey/pIdxKey.
4403 **
4404 */
sqlite3BtreeMovetoUnpacked(BtCursor * pCur,UnpackedRecord * pIdxKey,i64 intKey,int biasRight,int * pRes)4405 int sqlite3BtreeMovetoUnpacked(
4406   BtCursor *pCur,          /* The cursor to be moved */
4407   UnpackedRecord *pIdxKey, /* Unpacked index key */
4408   i64 intKey,              /* The table key */
4409   int biasRight,           /* If true, bias the search to the high end */
4410   int *pRes                /* Write search results here */
4411 ){
4412   int rc;
4413 
4414   assert( cursorHoldsMutex(pCur) );
4415   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4416   assert( pRes );
4417   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
4418 
4419   /* If the cursor is already positioned at the point we are trying
4420   ** to move to, then just return without doing any work */
4421   if( pCur->eState==CURSOR_VALID && pCur->validNKey
4422    && pCur->apPage[0]->intKey
4423   ){
4424     if( pCur->info.nKey==intKey ){
4425       *pRes = 0;
4426       return SQLITE_OK;
4427     }
4428     if( pCur->atLast && pCur->info.nKey<intKey ){
4429       *pRes = -1;
4430       return SQLITE_OK;
4431     }
4432   }
4433 
4434   rc = moveToRoot(pCur);
4435   if( rc ){
4436     return rc;
4437   }
4438   assert( pCur->apPage[pCur->iPage] );
4439   assert( pCur->apPage[pCur->iPage]->isInit );
4440   assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID );
4441   if( pCur->eState==CURSOR_INVALID ){
4442     *pRes = -1;
4443     assert( pCur->apPage[pCur->iPage]->nCell==0 );
4444     return SQLITE_OK;
4445   }
4446   assert( pCur->apPage[0]->intKey || pIdxKey );
4447   for(;;){
4448     int lwr, upr;
4449     Pgno chldPg;
4450     MemPage *pPage = pCur->apPage[pCur->iPage];
4451     int c;
4452 
4453     /* pPage->nCell must be greater than zero. If this is the root-page
4454     ** the cursor would have been INVALID above and this for(;;) loop
4455     ** not run. If this is not the root-page, then the moveToChild() routine
4456     ** would have already detected db corruption. Similarly, pPage must
4457     ** be the right kind (index or table) of b-tree page. Otherwise
4458     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
4459     assert( pPage->nCell>0 );
4460     assert( pPage->intKey==(pIdxKey==0) );
4461     lwr = 0;
4462     upr = pPage->nCell-1;
4463     if( biasRight ){
4464       pCur->aiIdx[pCur->iPage] = (u16)upr;
4465     }else{
4466       pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2);
4467     }
4468     for(;;){
4469       int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */
4470       u8 *pCell;                          /* Pointer to current cell in pPage */
4471 
4472       pCur->info.nSize = 0;
4473       pCell = findCell(pPage, idx) + pPage->childPtrSize;
4474       if( pPage->intKey ){
4475         i64 nCellKey;
4476         if( pPage->hasData ){
4477           u32 dummy;
4478           pCell += getVarint32(pCell, dummy);
4479         }
4480         getVarint(pCell, (u64*)&nCellKey);
4481         if( nCellKey==intKey ){
4482           c = 0;
4483         }else if( nCellKey<intKey ){
4484           c = -1;
4485         }else{
4486           assert( nCellKey>intKey );
4487           c = +1;
4488         }
4489         pCur->validNKey = 1;
4490         pCur->info.nKey = nCellKey;
4491       }else{
4492         /* The maximum supported page-size is 65536 bytes. This means that
4493         ** the maximum number of record bytes stored on an index B-Tree
4494         ** page is less than 16384 bytes and may be stored as a 2-byte
4495         ** varint. This information is used to attempt to avoid parsing
4496         ** the entire cell by checking for the cases where the record is
4497         ** stored entirely within the b-tree page by inspecting the first
4498         ** 2 bytes of the cell.
4499         */
4500         int nCell = pCell[0];
4501         if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){
4502           /* This branch runs if the record-size field of the cell is a
4503           ** single byte varint and the record fits entirely on the main
4504           ** b-tree page.  */
4505           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
4506         }else if( !(pCell[1] & 0x80)
4507           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4508         ){
4509           /* The record-size field is a 2 byte varint and the record
4510           ** fits entirely on the main b-tree page.  */
4511           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
4512         }else{
4513           /* The record flows over onto one or more overflow pages. In
4514           ** this case the whole cell needs to be parsed, a buffer allocated
4515           ** and accessPayload() used to retrieve the record into the
4516           ** buffer before VdbeRecordCompare() can be called. */
4517           void *pCellKey;
4518           u8 * const pCellBody = pCell - pPage->childPtrSize;
4519           btreeParseCellPtr(pPage, pCellBody, &pCur->info);
4520           nCell = (int)pCur->info.nKey;
4521           pCellKey = sqlite3Malloc( nCell );
4522           if( pCellKey==0 ){
4523             rc = SQLITE_NOMEM;
4524             goto moveto_finish;
4525           }
4526           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
4527           if( rc ){
4528             sqlite3_free(pCellKey);
4529             goto moveto_finish;
4530           }
4531           c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
4532           sqlite3_free(pCellKey);
4533         }
4534       }
4535       if( c==0 ){
4536         if( pPage->intKey && !pPage->leaf ){
4537           lwr = idx;
4538           upr = lwr - 1;
4539           break;
4540         }else{
4541           *pRes = 0;
4542           rc = SQLITE_OK;
4543           goto moveto_finish;
4544         }
4545       }
4546       if( c<0 ){
4547         lwr = idx+1;
4548       }else{
4549         upr = idx-1;
4550       }
4551       if( lwr>upr ){
4552         break;
4553       }
4554       pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2);
4555     }
4556     assert( lwr==upr+1 );
4557     assert( pPage->isInit );
4558     if( pPage->leaf ){
4559       chldPg = 0;
4560     }else if( lwr>=pPage->nCell ){
4561       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4562     }else{
4563       chldPg = get4byte(findCell(pPage, lwr));
4564     }
4565     if( chldPg==0 ){
4566       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4567       *pRes = c;
4568       rc = SQLITE_OK;
4569       goto moveto_finish;
4570     }
4571     pCur->aiIdx[pCur->iPage] = (u16)lwr;
4572     pCur->info.nSize = 0;
4573     pCur->validNKey = 0;
4574     rc = moveToChild(pCur, chldPg);
4575     if( rc ) goto moveto_finish;
4576   }
4577 moveto_finish:
4578   return rc;
4579 }
4580 
4581 
4582 /*
4583 ** Return TRUE if the cursor is not pointing at an entry of the table.
4584 **
4585 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4586 ** past the last entry in the table or sqlite3BtreePrev() moves past
4587 ** the first entry.  TRUE is also returned if the table is empty.
4588 */
sqlite3BtreeEof(BtCursor * pCur)4589 int sqlite3BtreeEof(BtCursor *pCur){
4590   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4591   ** have been deleted? This API will need to change to return an error code
4592   ** as well as the boolean result value.
4593   */
4594   return (CURSOR_VALID!=pCur->eState);
4595 }
4596 
4597 /*
4598 ** Advance the cursor to the next entry in the database.  If
4599 ** successful then set *pRes=0.  If the cursor
4600 ** was already pointing to the last entry in the database before
4601 ** this routine was called, then set *pRes=1.
4602 */
sqlite3BtreeNext(BtCursor * pCur,int * pRes)4603 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4604   int rc;
4605   int idx;
4606   MemPage *pPage;
4607 
4608   assert( cursorHoldsMutex(pCur) );
4609   rc = restoreCursorPosition(pCur);
4610   if( rc!=SQLITE_OK ){
4611     return rc;
4612   }
4613   assert( pRes!=0 );
4614   if( CURSOR_INVALID==pCur->eState ){
4615     *pRes = 1;
4616     return SQLITE_OK;
4617   }
4618   if( pCur->skipNext>0 ){
4619     pCur->skipNext = 0;
4620     *pRes = 0;
4621     return SQLITE_OK;
4622   }
4623   pCur->skipNext = 0;
4624 
4625   pPage = pCur->apPage[pCur->iPage];
4626   idx = ++pCur->aiIdx[pCur->iPage];
4627   assert( pPage->isInit );
4628   assert( idx<=pPage->nCell );
4629 
4630   pCur->info.nSize = 0;
4631   pCur->validNKey = 0;
4632   if( idx>=pPage->nCell ){
4633     if( !pPage->leaf ){
4634       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4635       if( rc ) return rc;
4636       rc = moveToLeftmost(pCur);
4637       *pRes = 0;
4638       return rc;
4639     }
4640     do{
4641       if( pCur->iPage==0 ){
4642         *pRes = 1;
4643         pCur->eState = CURSOR_INVALID;
4644         return SQLITE_OK;
4645       }
4646       moveToParent(pCur);
4647       pPage = pCur->apPage[pCur->iPage];
4648     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4649     *pRes = 0;
4650     if( pPage->intKey ){
4651       rc = sqlite3BtreeNext(pCur, pRes);
4652     }else{
4653       rc = SQLITE_OK;
4654     }
4655     return rc;
4656   }
4657   *pRes = 0;
4658   if( pPage->leaf ){
4659     return SQLITE_OK;
4660   }
4661   rc = moveToLeftmost(pCur);
4662   return rc;
4663 }
4664 
4665 
4666 /*
4667 ** Step the cursor to the back to the previous entry in the database.  If
4668 ** successful then set *pRes=0.  If the cursor
4669 ** was already pointing to the first entry in the database before
4670 ** this routine was called, then set *pRes=1.
4671 */
sqlite3BtreePrevious(BtCursor * pCur,int * pRes)4672 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4673   int rc;
4674   MemPage *pPage;
4675 
4676   assert( cursorHoldsMutex(pCur) );
4677   rc = restoreCursorPosition(pCur);
4678   if( rc!=SQLITE_OK ){
4679     return rc;
4680   }
4681   pCur->atLast = 0;
4682   if( CURSOR_INVALID==pCur->eState ){
4683     *pRes = 1;
4684     return SQLITE_OK;
4685   }
4686   if( pCur->skipNext<0 ){
4687     pCur->skipNext = 0;
4688     *pRes = 0;
4689     return SQLITE_OK;
4690   }
4691   pCur->skipNext = 0;
4692 
4693   pPage = pCur->apPage[pCur->iPage];
4694   assert( pPage->isInit );
4695   if( !pPage->leaf ){
4696     int idx = pCur->aiIdx[pCur->iPage];
4697     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4698     if( rc ){
4699       return rc;
4700     }
4701     rc = moveToRightmost(pCur);
4702   }else{
4703     while( pCur->aiIdx[pCur->iPage]==0 ){
4704       if( pCur->iPage==0 ){
4705         pCur->eState = CURSOR_INVALID;
4706         *pRes = 1;
4707         return SQLITE_OK;
4708       }
4709       moveToParent(pCur);
4710     }
4711     pCur->info.nSize = 0;
4712     pCur->validNKey = 0;
4713 
4714     pCur->aiIdx[pCur->iPage]--;
4715     pPage = pCur->apPage[pCur->iPage];
4716     if( pPage->intKey && !pPage->leaf ){
4717       rc = sqlite3BtreePrevious(pCur, pRes);
4718     }else{
4719       rc = SQLITE_OK;
4720     }
4721   }
4722   *pRes = 0;
4723   return rc;
4724 }
4725 
4726 /*
4727 ** Allocate a new page from the database file.
4728 **
4729 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
4730 ** has already been called on the new page.)  The new page has also
4731 ** been referenced and the calling routine is responsible for calling
4732 ** sqlite3PagerUnref() on the new page when it is done.
4733 **
4734 ** SQLITE_OK is returned on success.  Any other return value indicates
4735 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
4736 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4737 **
4738 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4739 ** locate a page close to the page number "nearby".  This can be used in an
4740 ** attempt to keep related pages close to each other in the database file,
4741 ** which in turn can make database access faster.
4742 **
4743 ** If the "exact" parameter is not 0, and the page-number nearby exists
4744 ** anywhere on the free-list, then it is guaranteed to be returned. This
4745 ** is only used by auto-vacuum databases when allocating a new table.
4746 */
allocateBtreePage(BtShared * pBt,MemPage ** ppPage,Pgno * pPgno,Pgno nearby,u8 exact)4747 static int allocateBtreePage(
4748   BtShared *pBt,
4749   MemPage **ppPage,
4750   Pgno *pPgno,
4751   Pgno nearby,
4752   u8 exact
4753 ){
4754   MemPage *pPage1;
4755   int rc;
4756   u32 n;     /* Number of pages on the freelist */
4757   u32 k;     /* Number of leaves on the trunk of the freelist */
4758   MemPage *pTrunk = 0;
4759   MemPage *pPrevTrunk = 0;
4760   Pgno mxPage;     /* Total size of the database file */
4761 
4762   assert( sqlite3_mutex_held(pBt->mutex) );
4763   pPage1 = pBt->pPage1;
4764   mxPage = btreePagecount(pBt);
4765   n = get4byte(&pPage1->aData[36]);
4766   testcase( n==mxPage-1 );
4767   if( n>=mxPage ){
4768     return SQLITE_CORRUPT_BKPT;
4769   }
4770   if( n>0 ){
4771     /* There are pages on the freelist.  Reuse one of those pages. */
4772     Pgno iTrunk;
4773     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4774 
4775     /* If the 'exact' parameter was true and a query of the pointer-map
4776     ** shows that the page 'nearby' is somewhere on the free-list, then
4777     ** the entire-list will be searched for that page.
4778     */
4779 #ifndef SQLITE_OMIT_AUTOVACUUM
4780     if( exact && nearby<=mxPage ){
4781       u8 eType;
4782       assert( nearby>0 );
4783       assert( pBt->autoVacuum );
4784       rc = ptrmapGet(pBt, nearby, &eType, 0);
4785       if( rc ) return rc;
4786       if( eType==PTRMAP_FREEPAGE ){
4787         searchList = 1;
4788       }
4789       *pPgno = nearby;
4790     }
4791 #endif
4792 
4793     /* Decrement the free-list count by 1. Set iTrunk to the index of the
4794     ** first free-list trunk page. iPrevTrunk is initially 1.
4795     */
4796     rc = sqlite3PagerWrite(pPage1->pDbPage);
4797     if( rc ) return rc;
4798     put4byte(&pPage1->aData[36], n-1);
4799 
4800     /* The code within this loop is run only once if the 'searchList' variable
4801     ** is not true. Otherwise, it runs once for each trunk-page on the
4802     ** free-list until the page 'nearby' is located.
4803     */
4804     do {
4805       pPrevTrunk = pTrunk;
4806       if( pPrevTrunk ){
4807         iTrunk = get4byte(&pPrevTrunk->aData[0]);
4808       }else{
4809         iTrunk = get4byte(&pPage1->aData[32]);
4810       }
4811       testcase( iTrunk==mxPage );
4812       if( iTrunk>mxPage ){
4813         rc = SQLITE_CORRUPT_BKPT;
4814       }else{
4815         rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
4816       }
4817       if( rc ){
4818         pTrunk = 0;
4819         goto end_allocate_page;
4820       }
4821 
4822       k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
4823       if( k==0 && !searchList ){
4824         /* The trunk has no leaves and the list is not being searched.
4825         ** So extract the trunk page itself and use it as the newly
4826         ** allocated page */
4827         assert( pPrevTrunk==0 );
4828         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4829         if( rc ){
4830           goto end_allocate_page;
4831         }
4832         *pPgno = iTrunk;
4833         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4834         *ppPage = pTrunk;
4835         pTrunk = 0;
4836         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4837       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
4838         /* Value of k is out of range.  Database corruption */
4839         rc = SQLITE_CORRUPT_BKPT;
4840         goto end_allocate_page;
4841 #ifndef SQLITE_OMIT_AUTOVACUUM
4842       }else if( searchList && nearby==iTrunk ){
4843         /* The list is being searched and this trunk page is the page
4844         ** to allocate, regardless of whether it has leaves.
4845         */
4846         assert( *pPgno==iTrunk );
4847         *ppPage = pTrunk;
4848         searchList = 0;
4849         rc = sqlite3PagerWrite(pTrunk->pDbPage);
4850         if( rc ){
4851           goto end_allocate_page;
4852         }
4853         if( k==0 ){
4854           if( !pPrevTrunk ){
4855             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4856           }else{
4857             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4858             if( rc!=SQLITE_OK ){
4859               goto end_allocate_page;
4860             }
4861             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4862           }
4863         }else{
4864           /* The trunk page is required by the caller but it contains
4865           ** pointers to free-list leaves. The first leaf becomes a trunk
4866           ** page in this case.
4867           */
4868           MemPage *pNewTrunk;
4869           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4870           if( iNewTrunk>mxPage ){
4871             rc = SQLITE_CORRUPT_BKPT;
4872             goto end_allocate_page;
4873           }
4874           testcase( iNewTrunk==mxPage );
4875           rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4876           if( rc!=SQLITE_OK ){
4877             goto end_allocate_page;
4878           }
4879           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4880           if( rc!=SQLITE_OK ){
4881             releasePage(pNewTrunk);
4882             goto end_allocate_page;
4883           }
4884           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4885           put4byte(&pNewTrunk->aData[4], k-1);
4886           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4887           releasePage(pNewTrunk);
4888           if( !pPrevTrunk ){
4889             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
4890             put4byte(&pPage1->aData[32], iNewTrunk);
4891           }else{
4892             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4893             if( rc ){
4894               goto end_allocate_page;
4895             }
4896             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4897           }
4898         }
4899         pTrunk = 0;
4900         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4901 #endif
4902       }else if( k>0 ){
4903         /* Extract a leaf from the trunk */
4904         u32 closest;
4905         Pgno iPage;
4906         unsigned char *aData = pTrunk->aData;
4907         if( nearby>0 ){
4908           u32 i;
4909           int dist;
4910           closest = 0;
4911           dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
4912           for(i=1; i<k; i++){
4913             int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
4914             if( d2<dist ){
4915               closest = i;
4916               dist = d2;
4917             }
4918           }
4919         }else{
4920           closest = 0;
4921         }
4922 
4923         iPage = get4byte(&aData[8+closest*4]);
4924         testcase( iPage==mxPage );
4925         if( iPage>mxPage ){
4926           rc = SQLITE_CORRUPT_BKPT;
4927           goto end_allocate_page;
4928         }
4929         testcase( iPage==mxPage );
4930         if( !searchList || iPage==nearby ){
4931           int noContent;
4932           *pPgno = iPage;
4933           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4934                  ": %d more free pages\n",
4935                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
4936           rc = sqlite3PagerWrite(pTrunk->pDbPage);
4937           if( rc ) goto end_allocate_page;
4938           if( closest<k-1 ){
4939             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4940           }
4941           put4byte(&aData[4], k-1);
4942           noContent = !btreeGetHasContent(pBt, *pPgno);
4943           rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
4944           if( rc==SQLITE_OK ){
4945             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4946             if( rc!=SQLITE_OK ){
4947               releasePage(*ppPage);
4948             }
4949           }
4950           searchList = 0;
4951         }
4952       }
4953       releasePage(pPrevTrunk);
4954       pPrevTrunk = 0;
4955     }while( searchList );
4956   }else{
4957     /* There are no pages on the freelist, so create a new page at the
4958     ** end of the file */
4959     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
4960     if( rc ) return rc;
4961     pBt->nPage++;
4962     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
4963 
4964 #ifndef SQLITE_OMIT_AUTOVACUUM
4965     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
4966       /* If *pPgno refers to a pointer-map page, allocate two new pages
4967       ** at the end of the file instead of one. The first allocated page
4968       ** becomes a new pointer-map page, the second is used by the caller.
4969       */
4970       MemPage *pPg = 0;
4971       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
4972       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
4973       rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1);
4974       if( rc==SQLITE_OK ){
4975         rc = sqlite3PagerWrite(pPg->pDbPage);
4976         releasePage(pPg);
4977       }
4978       if( rc ) return rc;
4979       pBt->nPage++;
4980       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
4981     }
4982 #endif
4983     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
4984     *pPgno = pBt->nPage;
4985 
4986     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4987     rc = btreeGetPage(pBt, *pPgno, ppPage, 1);
4988     if( rc ) return rc;
4989     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4990     if( rc!=SQLITE_OK ){
4991       releasePage(*ppPage);
4992     }
4993     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4994   }
4995 
4996   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4997 
4998 end_allocate_page:
4999   releasePage(pTrunk);
5000   releasePage(pPrevTrunk);
5001   if( rc==SQLITE_OK ){
5002     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
5003       releasePage(*ppPage);
5004       return SQLITE_CORRUPT_BKPT;
5005     }
5006     (*ppPage)->isInit = 0;
5007   }else{
5008     *ppPage = 0;
5009   }
5010   assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) );
5011   return rc;
5012 }
5013 
5014 /*
5015 ** This function is used to add page iPage to the database file free-list.
5016 ** It is assumed that the page is not already a part of the free-list.
5017 **
5018 ** The value passed as the second argument to this function is optional.
5019 ** If the caller happens to have a pointer to the MemPage object
5020 ** corresponding to page iPage handy, it may pass it as the second value.
5021 ** Otherwise, it may pass NULL.
5022 **
5023 ** If a pointer to a MemPage object is passed as the second argument,
5024 ** its reference count is not altered by this function.
5025 */
freePage2(BtShared * pBt,MemPage * pMemPage,Pgno iPage)5026 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5027   MemPage *pTrunk = 0;                /* Free-list trunk page */
5028   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
5029   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
5030   MemPage *pPage;                     /* Page being freed. May be NULL. */
5031   int rc;                             /* Return Code */
5032   int nFree;                          /* Initial number of pages on free-list */
5033 
5034   assert( sqlite3_mutex_held(pBt->mutex) );
5035   assert( iPage>1 );
5036   assert( !pMemPage || pMemPage->pgno==iPage );
5037 
5038   if( pMemPage ){
5039     pPage = pMemPage;
5040     sqlite3PagerRef(pPage->pDbPage);
5041   }else{
5042     pPage = btreePageLookup(pBt, iPage);
5043   }
5044 
5045   /* Increment the free page count on pPage1 */
5046   rc = sqlite3PagerWrite(pPage1->pDbPage);
5047   if( rc ) goto freepage_out;
5048   nFree = get4byte(&pPage1->aData[36]);
5049   put4byte(&pPage1->aData[36], nFree+1);
5050 
5051   if( pBt->secureDelete ){
5052     /* If the secure_delete option is enabled, then
5053     ** always fully overwrite deleted information with zeros.
5054     */
5055     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5056      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5057     ){
5058       goto freepage_out;
5059     }
5060     memset(pPage->aData, 0, pPage->pBt->pageSize);
5061   }
5062 
5063   /* If the database supports auto-vacuum, write an entry in the pointer-map
5064   ** to indicate that the page is free.
5065   */
5066   if( ISAUTOVACUUM ){
5067     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5068     if( rc ) goto freepage_out;
5069   }
5070 
5071   /* Now manipulate the actual database free-list structure. There are two
5072   ** possibilities. If the free-list is currently empty, or if the first
5073   ** trunk page in the free-list is full, then this page will become a
5074   ** new free-list trunk page. Otherwise, it will become a leaf of the
5075   ** first trunk page in the current free-list. This block tests if it
5076   ** is possible to add the page as a new free-list leaf.
5077   */
5078   if( nFree!=0 ){
5079     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
5080 
5081     iTrunk = get4byte(&pPage1->aData[32]);
5082     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5083     if( rc!=SQLITE_OK ){
5084       goto freepage_out;
5085     }
5086 
5087     nLeaf = get4byte(&pTrunk->aData[4]);
5088     assert( pBt->usableSize>32 );
5089     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5090       rc = SQLITE_CORRUPT_BKPT;
5091       goto freepage_out;
5092     }
5093     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5094       /* In this case there is room on the trunk page to insert the page
5095       ** being freed as a new leaf.
5096       **
5097       ** Note that the trunk page is not really full until it contains
5098       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5099       ** coded.  But due to a coding error in versions of SQLite prior to
5100       ** 3.6.0, databases with freelist trunk pages holding more than
5101       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
5102       ** to maintain backwards compatibility with older versions of SQLite,
5103       ** we will continue to restrict the number of entries to usableSize/4 - 8
5104       ** for now.  At some point in the future (once everyone has upgraded
5105       ** to 3.6.0 or later) we should consider fixing the conditional above
5106       ** to read "usableSize/4-2" instead of "usableSize/4-8".
5107       */
5108       rc = sqlite3PagerWrite(pTrunk->pDbPage);
5109       if( rc==SQLITE_OK ){
5110         put4byte(&pTrunk->aData[4], nLeaf+1);
5111         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5112         if( pPage && !pBt->secureDelete ){
5113           sqlite3PagerDontWrite(pPage->pDbPage);
5114         }
5115         rc = btreeSetHasContent(pBt, iPage);
5116       }
5117       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5118       goto freepage_out;
5119     }
5120   }
5121 
5122   /* If control flows to this point, then it was not possible to add the
5123   ** the page being freed as a leaf page of the first trunk in the free-list.
5124   ** Possibly because the free-list is empty, or possibly because the
5125   ** first trunk in the free-list is full. Either way, the page being freed
5126   ** will become the new first trunk page in the free-list.
5127   */
5128   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5129     goto freepage_out;
5130   }
5131   rc = sqlite3PagerWrite(pPage->pDbPage);
5132   if( rc!=SQLITE_OK ){
5133     goto freepage_out;
5134   }
5135   put4byte(pPage->aData, iTrunk);
5136   put4byte(&pPage->aData[4], 0);
5137   put4byte(&pPage1->aData[32], iPage);
5138   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5139 
5140 freepage_out:
5141   if( pPage ){
5142     pPage->isInit = 0;
5143   }
5144   releasePage(pPage);
5145   releasePage(pTrunk);
5146   return rc;
5147 }
/*
** Wrapper around freePage2() that follows the "*pRC" error convention:
** if *pRC already holds an error this is a no-op; otherwise page pPage is
** passed to freePage2() and the result code is stored in *pRC.
*/
static void freePage(MemPage *pPage, int *pRC){
  if( (*pRC)!=SQLITE_OK ) return;
  *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
}
5153 
5154 /*
5155 ** Free any overflow pages associated with the given Cell.
5156 */
clearCell(MemPage * pPage,unsigned char * pCell)5157 static int clearCell(MemPage *pPage, unsigned char *pCell){
5158   BtShared *pBt = pPage->pBt;
5159   CellInfo info;
5160   Pgno ovflPgno;
5161   int rc;
5162   int nOvfl;
5163   u32 ovflPageSize;
5164 
5165   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5166   btreeParseCellPtr(pPage, pCell, &info);
5167   if( info.iOverflow==0 ){
5168     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
5169   }
5170   ovflPgno = get4byte(&pCell[info.iOverflow]);
5171   assert( pBt->usableSize > 4 );
5172   ovflPageSize = pBt->usableSize - 4;
5173   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5174   assert( ovflPgno==0 || nOvfl>0 );
5175   while( nOvfl-- ){
5176     Pgno iNext = 0;
5177     MemPage *pOvfl = 0;
5178     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
5179       /* 0 is not a legal page number and page 1 cannot be an
5180       ** overflow page. Therefore if ovflPgno<2 or past the end of the
5181       ** file the database must be corrupt. */
5182       return SQLITE_CORRUPT_BKPT;
5183     }
5184     if( nOvfl ){
5185       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5186       if( rc ) return rc;
5187     }
5188 
5189     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
5190      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
5191     ){
5192       /* There is no reason any cursor should have an outstanding reference
5193       ** to an overflow page belonging to a cell that is being deleted/updated.
5194       ** So if there exists more than one reference to this page, then it
5195       ** must not really be an overflow page and the database must be corrupt.
5196       ** It is helpful to detect this before calling freePage2(), as
5197       ** freePage2() may zero the page contents if secure-delete mode is
5198       ** enabled. If this 'overflow' page happens to be a page that the
5199       ** caller is iterating through or using in some other way, this
5200       ** can be problematic.
5201       */
5202       rc = SQLITE_CORRUPT_BKPT;
5203     }else{
5204       rc = freePage2(pBt, pOvfl, ovflPgno);
5205     }
5206 
5207     if( pOvfl ){
5208       sqlite3PagerUnref(pOvfl->pDbPage);
5209     }
5210     if( rc ) return rc;
5211     ovflPgno = iNext;
5212   }
5213   return SQLITE_OK;
5214 }
5215 
5216 /*
5217 ** Create the byte sequence used to represent a cell on page pPage
5218 ** and write that byte sequence into pCell[].  Overflow pages are
5219 ** allocated and filled in as necessary.  The calling procedure
5220 ** is responsible for making sure sufficient space has been allocated
5221 ** for pCell[].
5222 **
5223 ** Note that pCell does not necessary need to point to the pPage->aData
5224 ** area.  pCell might point to some temporary storage.  The cell will
5225 ** be constructed in this temporary area then copied into pPage->aData
5226 ** later.
5227 */
fillInCell(MemPage * pPage,unsigned char * pCell,const void * pKey,i64 nKey,const void * pData,int nData,int nZero,int * pnSize)5228 static int fillInCell(
5229   MemPage *pPage,                /* The page that contains the cell */
5230   unsigned char *pCell,          /* Complete text of the cell */
5231   const void *pKey, i64 nKey,    /* The key */
5232   const void *pData,int nData,   /* The data */
5233   int nZero,                     /* Extra zero bytes to append to pData */
5234   int *pnSize                    /* Write cell size here */
5235 ){
5236   int nPayload;
5237   const u8 *pSrc;
5238   int nSrc, n, rc;
5239   int spaceLeft;
5240   MemPage *pOvfl = 0;
5241   MemPage *pToRelease = 0;
5242   unsigned char *pPrior;
5243   unsigned char *pPayload;
5244   BtShared *pBt = pPage->pBt;
5245   Pgno pgnoOvfl = 0;
5246   int nHeader;
5247   CellInfo info;
5248 
5249   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5250 
5251   /* pPage is not necessarily writeable since pCell might be auxiliary
5252   ** buffer space that is separate from the pPage buffer area */
5253   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
5254             || sqlite3PagerIswriteable(pPage->pDbPage) );
5255 
5256   /* Fill in the header. */
5257   nHeader = 0;
5258   if( !pPage->leaf ){
5259     nHeader += 4;
5260   }
5261   if( pPage->hasData ){
5262     nHeader += putVarint(&pCell[nHeader], nData+nZero);
5263   }else{
5264     nData = nZero = 0;
5265   }
5266   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
5267   btreeParseCellPtr(pPage, pCell, &info);
5268   assert( info.nHeader==nHeader );
5269   assert( info.nKey==nKey );
5270   assert( info.nData==(u32)(nData+nZero) );
5271 
5272   /* Fill in the payload */
5273   nPayload = nData + nZero;
5274   if( pPage->intKey ){
5275     pSrc = pData;
5276     nSrc = nData;
5277     nData = 0;
5278   }else{
5279     if( NEVER(nKey>0x7fffffff || pKey==0) ){
5280       return SQLITE_CORRUPT_BKPT;
5281     }
5282     nPayload += (int)nKey;
5283     pSrc = pKey;
5284     nSrc = (int)nKey;
5285   }
5286   *pnSize = info.nSize;
5287   spaceLeft = info.nLocal;
5288   pPayload = &pCell[nHeader];
5289   pPrior = &pCell[info.iOverflow];
5290 
5291   while( nPayload>0 ){
5292     if( spaceLeft==0 ){
5293 #ifndef SQLITE_OMIT_AUTOVACUUM
5294       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
5295       if( pBt->autoVacuum ){
5296         do{
5297           pgnoOvfl++;
5298         } while(
5299           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
5300         );
5301       }
5302 #endif
5303       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
5304 #ifndef SQLITE_OMIT_AUTOVACUUM
5305       /* If the database supports auto-vacuum, and the second or subsequent
5306       ** overflow page is being allocated, add an entry to the pointer-map
5307       ** for that page now.
5308       **
5309       ** If this is the first overflow page, then write a partial entry
5310       ** to the pointer-map. If we write nothing to this pointer-map slot,
5311       ** then the optimistic overflow chain processing in clearCell()
5312       ** may misinterpret the uninitialised values and delete the
5313       ** wrong pages from the database.
5314       */
5315       if( pBt->autoVacuum && rc==SQLITE_OK ){
5316         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
5317         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
5318         if( rc ){
5319           releasePage(pOvfl);
5320         }
5321       }
5322 #endif
5323       if( rc ){
5324         releasePage(pToRelease);
5325         return rc;
5326       }
5327 
5328       /* If pToRelease is not zero than pPrior points into the data area
5329       ** of pToRelease.  Make sure pToRelease is still writeable. */
5330       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5331 
5332       /* If pPrior is part of the data area of pPage, then make sure pPage
5333       ** is still writeable */
5334       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
5335             || sqlite3PagerIswriteable(pPage->pDbPage) );
5336 
5337       put4byte(pPrior, pgnoOvfl);
5338       releasePage(pToRelease);
5339       pToRelease = pOvfl;
5340       pPrior = pOvfl->aData;
5341       put4byte(pPrior, 0);
5342       pPayload = &pOvfl->aData[4];
5343       spaceLeft = pBt->usableSize - 4;
5344     }
5345     n = nPayload;
5346     if( n>spaceLeft ) n = spaceLeft;
5347 
5348     /* If pToRelease is not zero than pPayload points into the data area
5349     ** of pToRelease.  Make sure pToRelease is still writeable. */
5350     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5351 
5352     /* If pPayload is part of the data area of pPage, then make sure pPage
5353     ** is still writeable */
5354     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
5355             || sqlite3PagerIswriteable(pPage->pDbPage) );
5356 
5357     if( nSrc>0 ){
5358       if( n>nSrc ) n = nSrc;
5359       assert( pSrc );
5360       memcpy(pPayload, pSrc, n);
5361     }else{
5362       memset(pPayload, 0, n);
5363     }
5364     nPayload -= n;
5365     pPayload += n;
5366     pSrc += n;
5367     nSrc -= n;
5368     spaceLeft -= n;
5369     if( nSrc==0 ){
5370       nSrc = nData;
5371       pSrc = pData;
5372     }
5373   }
5374   releasePage(pToRelease);
5375   return SQLITE_OK;
5376 }
5377 
5378 /*
5379 ** Remove the i-th cell from pPage.  This routine effects pPage only.
5380 ** The cell content is not freed or deallocated.  It is assumed that
5381 ** the cell content has been copied someplace else.  This routine just
5382 ** removes the reference to the cell from pPage.
5383 **
5384 ** "sz" must be the number of bytes in the cell.
5385 */
dropCell(MemPage * pPage,int idx,int sz,int * pRC)5386 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
5387   int i;          /* Loop counter */
5388   u32 pc;         /* Offset to cell content of cell being deleted */
5389   u8 *data;       /* pPage->aData */
5390   u8 *ptr;        /* Used to move bytes around within data[] */
5391   int rc;         /* The return code */
5392   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
5393 
5394   if( *pRC ) return;
5395 
5396   assert( idx>=0 && idx<pPage->nCell );
5397   assert( sz==cellSize(pPage, idx) );
5398   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5399   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5400   data = pPage->aData;
5401   ptr = &data[pPage->cellOffset + 2*idx];
5402   pc = get2byte(ptr);
5403   hdr = pPage->hdrOffset;
5404   testcase( pc==get2byte(&data[hdr+5]) );
5405   testcase( pc+sz==pPage->pBt->usableSize );
5406   if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
5407     *pRC = SQLITE_CORRUPT_BKPT;
5408     return;
5409   }
5410   rc = freeSpace(pPage, pc, sz);
5411   if( rc ){
5412     *pRC = rc;
5413     return;
5414   }
5415   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
5416     ptr[0] = ptr[2];
5417     ptr[1] = ptr[3];
5418   }
5419   pPage->nCell--;
5420   put2byte(&data[hdr+3], pPage->nCell);
5421   pPage->nFree += 2;
5422 }
5423 
5424 /*
5425 ** Insert a new cell on pPage at cell index "i".  pCell points to the
5426 ** content of the cell.
5427 **
5428 ** If the cell content will fit on the page, then put it there.  If it
5429 ** will not fit, then make a copy of the cell content into pTemp if
5430 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
5431 ** in pPage->aOvfl[] and make it point to the cell content (either
5432 ** in pTemp or the original pCell) and also record its index.
5433 ** Allocating a new entry in pPage->aCell[] implies that
5434 ** pPage->nOverflow is incremented.
5435 **
5436 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
5437 ** cell. The caller will overwrite them after this function returns. If
5438 ** nSkip is non-zero, then pCell may not point to an invalid memory location
5439 ** (but pCell+nSkip is always valid).
5440 */
insertCell(MemPage * pPage,int i,u8 * pCell,int sz,u8 * pTemp,Pgno iChild,int * pRC)5441 static void insertCell(
5442   MemPage *pPage,   /* Page into which we are copying */
5443   int i,            /* New cell becomes the i-th cell of the page */
5444   u8 *pCell,        /* Content of the new cell */
5445   int sz,           /* Bytes of content in pCell */
5446   u8 *pTemp,        /* Temp storage space for pCell, if needed */
5447   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
5448   int *pRC          /* Read and write return code from here */
5449 ){
5450   int idx = 0;      /* Where to write new cell content in data[] */
5451   int j;            /* Loop counter */
5452   int end;          /* First byte past the last cell pointer in data[] */
5453   int ins;          /* Index in data[] where new cell pointer is inserted */
5454   int cellOffset;   /* Address of first cell pointer in data[] */
5455   u8 *data;         /* The content of the whole page */
5456   u8 *ptr;          /* Used for moving information around in data[] */
5457 
5458   int nSkip = (iChild ? 4 : 0);
5459 
5460   if( *pRC ) return;
5461 
5462   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
5463   assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
5464   assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
5465   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5466   /* The cell should normally be sized correctly.  However, when moving a
5467   ** malformed cell from a leaf page to an interior page, if the cell size
5468   ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
5469   ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
5470   ** the term after the || in the following assert(). */
5471   assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
5472   if( pPage->nOverflow || sz+2>pPage->nFree ){
5473     if( pTemp ){
5474       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
5475       pCell = pTemp;
5476     }
5477     if( iChild ){
5478       put4byte(pCell, iChild);
5479     }
5480     j = pPage->nOverflow++;
5481     assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
5482     pPage->aOvfl[j].pCell = pCell;
5483     pPage->aOvfl[j].idx = (u16)i;
5484   }else{
5485     int rc = sqlite3PagerWrite(pPage->pDbPage);
5486     if( rc!=SQLITE_OK ){
5487       *pRC = rc;
5488       return;
5489     }
5490     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5491     data = pPage->aData;
5492     cellOffset = pPage->cellOffset;
5493     end = cellOffset + 2*pPage->nCell;
5494     ins = cellOffset + 2*i;
5495     rc = allocateSpace(pPage, sz, &idx);
5496     if( rc ){ *pRC = rc; return; }
5497     /* The allocateSpace() routine guarantees the following two properties
5498     ** if it returns success */
5499     assert( idx >= end+2 );
5500     assert( idx+sz <= (int)pPage->pBt->usableSize );
5501     pPage->nCell++;
5502     pPage->nFree -= (u16)(2 + sz);
5503     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5504     if( iChild ){
5505       put4byte(&data[idx], iChild);
5506     }
5507     for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){
5508       ptr[0] = ptr[-2];
5509       ptr[1] = ptr[-1];
5510     }
5511     put2byte(&data[ins], idx);
5512     put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
5513 #ifndef SQLITE_OMIT_AUTOVACUUM
5514     if( pPage->pBt->autoVacuum ){
5515       /* The cell may contain a pointer to an overflow page. If so, write
5516       ** the entry for the overflow page into the pointer map.
5517       */
5518       ptrmapPutOvflPtr(pPage, pCell, pRC);
5519     }
5520 #endif
5521   }
5522 }
5523 
5524 /*
5525 ** Add a list of cells to a page.  The page should be initially empty.
5526 ** The cells are guaranteed to fit on the page.
5527 */
assemblePage(MemPage * pPage,int nCell,u8 ** apCell,u16 * aSize)5528 static void assemblePage(
5529   MemPage *pPage,   /* The page to be assemblied */
5530   int nCell,        /* The number of cells to add to this page */
5531   u8 **apCell,      /* Pointers to cell bodies */
5532   u16 *aSize        /* Sizes of the cells */
5533 ){
5534   int i;            /* Loop counter */
5535   u8 *pCellptr;     /* Address of next cell pointer */
5536   int cellbody;     /* Address of next cell body */
5537   u8 * const data = pPage->aData;             /* Pointer to data for pPage */
5538   const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */
5539   const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5540 
5541   assert( pPage->nOverflow==0 );
5542   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5543   assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
5544             && (int)MX_CELL(pPage->pBt)<=10921);
5545   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5546 
5547   /* Check that the page has just been zeroed by zeroPage() */
5548   assert( pPage->nCell==0 );
5549   assert( get2byteNotZero(&data[hdr+5])==nUsable );
5550 
5551   pCellptr = &data[pPage->cellOffset + nCell*2];
5552   cellbody = nUsable;
5553   for(i=nCell-1; i>=0; i--){
5554     pCellptr -= 2;
5555     cellbody -= aSize[i];
5556     put2byte(pCellptr, cellbody);
5557     memcpy(&data[cellbody], apCell[i], aSize[i]);
5558   }
5559   put2byte(&data[hdr+3], nCell);
5560   put2byte(&data[hdr+5], cellbody);
5561   pPage->nFree -= (nCell*2 + nUsable - cellbody);
5562   pPage->nCell = (u16)nCell;
5563 }
5564 
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** A value of 1 for NN appears to give the best results overall.
*/
#define NN 1             /* Neighbors on each side of pPage */
#define NB (1+2*NN)      /* Pages taking part in one balance: pPage + 2*NN */
5579 
5580 
5581 #ifndef SQLITE_OMIT_QUICKBALANCE
5582 /*
5583 ** This version of balance() handles the common special case where
5584 ** a new entry is being inserted on the extreme right-end of the
5585 ** tree, in other words, when the new entry will become the largest
5586 ** entry in the tree.
5587 **
5588 ** Instead of trying to balance the 3 right-most leaf pages, just add
5589 ** a new page to the right-hand side and put the one new entry in
5590 ** that page.  This leaves the right side of the tree somewhat
5591 ** unbalanced.  But odds are that we will be inserting new entries
5592 ** at the end soon afterwards so the nearly empty page will quickly
5593 ** fill up.  On average.
5594 **
5595 ** pPage is the leaf page which is the right-most page in the tree.
5596 ** pParent is its parent.  pPage must have a single overflow entry
5597 ** which is also the right-most entry on the page.
5598 **
5599 ** The pSpace buffer is used to store a temporary copy of the divider
5600 ** cell that will be inserted into pParent. Such a cell consists of a 4
5601 ** byte page number followed by a variable length integer. In other
5602 ** words, at most 13 bytes. Hence the pSpace buffer must be at
5603 ** least 13 bytes in size.
5604 */
balance_quick(MemPage * pParent,MemPage * pPage,u8 * pSpace)5605 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5606   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
5607   MemPage *pNew;                       /* Newly allocated page */
5608   int rc;                              /* Return Code */
5609   Pgno pgnoNew;                        /* Page number of pNew */
5610 
5611   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5612   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5613   assert( pPage->nOverflow==1 );
5614 
5615   /* This error condition is now caught prior to reaching this function */
5616   if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
5617 
5618   /* Allocate a new page. This page will become the right-sibling of
5619   ** pPage. Make the parent page writable, so that the new divider cell
5620   ** may be inserted. If both these operations are successful, proceed.
5621   */
5622   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5623 
5624   if( rc==SQLITE_OK ){
5625 
5626     u8 *pOut = &pSpace[4];
5627     u8 *pCell = pPage->aOvfl[0].pCell;
5628     u16 szCell = cellSizePtr(pPage, pCell);
5629     u8 *pStop;
5630 
5631     assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5632     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5633     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
5634     assemblePage(pNew, 1, &pCell, &szCell);
5635 
5636     /* If this is an auto-vacuum database, update the pointer map
5637     ** with entries for the new page, and any pointer from the
5638     ** cell on the page to an overflow page. If either of these
5639     ** operations fails, the return code is set, but the contents
5640     ** of the parent page are still manipulated by thh code below.
5641     ** That is Ok, at this point the parent page is guaranteed to
5642     ** be marked as dirty. Returning an error code will cause a
5643     ** rollback, undoing any changes made to the parent page.
5644     */
5645     if( ISAUTOVACUUM ){
5646       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5647       if( szCell>pNew->minLocal ){
5648         ptrmapPutOvflPtr(pNew, pCell, &rc);
5649       }
5650     }
5651 
5652     /* Create a divider cell to insert into pParent. The divider cell
5653     ** consists of a 4-byte page number (the page number of pPage) and
5654     ** a variable length key value (which must be the same value as the
5655     ** largest key on pPage).
5656     **
5657     ** To find the largest key value on pPage, first find the right-most
5658     ** cell on pPage. The first two fields of this cell are the
5659     ** record-length (a variable length integer at most 32-bits in size)
5660     ** and the key value (a variable length integer, may have any value).
5661     ** The first of the while(...) loops below skips over the record-length
5662     ** field. The second while(...) loop copies the key value from the
5663     ** cell on pPage into the pSpace buffer.
5664     */
5665     pCell = findCell(pPage, pPage->nCell-1);
5666     pStop = &pCell[9];
5667     while( (*(pCell++)&0x80) && pCell<pStop );
5668     pStop = &pCell[9];
5669     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5670 
5671     /* Insert the new divider cell into pParent. */
5672     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5673                0, pPage->pgno, &rc);
5674 
5675     /* Set the right-child pointer of pParent to point to the new page. */
5676     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5677 
5678     /* Release the reference to the new page. */
5679     releasePage(pNew);
5680   }
5681 
5682   return rc;
5683 }
5684 #endif /* SQLITE_OMIT_QUICKBALANCE */
5685 
#if 0
/*
** Debugging aid only: this routine plays no part in the normal operation
** of SQLite.  It is occasionally switched on by hand while working on the
** code that maintains pointer-map entries.
**
** For each of the nPage pages in apPage[], look up the pointer-map entry
** of every page that it references and assert() that the entry names this
** page as the parent with the expected type:
**
**   *  PTRMAP_OVERFLOW1 for the overflow page referenced by a cell, and
**   *  PTRMAP_BTREE for the child of each cell and for the right-child
**      of a page that is not a leaf.
**
** The return value is always 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPage;                       /* Index into apPage[] */
  for(iPage=0; iPage<nPage; iPage++){
    MemPage *pPg = apPage[iPage];  /* Page currently being verified */
    BtShared *pBt = pPg->pBt;
    Pgno pgnoParent;               /* Parent recorded in the pointer map */
    u8 eType;                      /* Entry type recorded in the pointer map */
    int iCell;                     /* Index of current cell on pPg */

    assert( pPg->isInit );

    for(iCell=0; iCell<pPg->nCell; iCell++){
      CellInfo info;
      u8 *pCell = findCell(pPg, iCell);

      btreeParseCellPtr(pPg, pCell, &info);

      /* Overflow page hanging off this cell, if any. */
      if( info.iOverflow ){
        Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
        ptrmapGet(pBt, pgnoOvfl, &eType, &pgnoParent);
        assert( pgnoParent==pPg->pgno && eType==PTRMAP_OVERFLOW1 );
      }

      /* Child page number stored in the first 4 bytes of the cell. */
      if( !pPg->leaf ){
        Pgno pgnoChild = get4byte(pCell);
        ptrmapGet(pBt, pgnoChild, &eType, &pgnoParent);
        assert( pgnoParent==pPg->pgno && eType==PTRMAP_BTREE );
      }
    }

    /* Right-child pointer stored in the page header. */
    if( !pPg->leaf ){
      Pgno pgnoRight = get4byte(&pPg->aData[pPg->hdrOffset+8]);
      ptrmapGet(pBt, pgnoRight, &eType, &pgnoParent);
      assert( pgnoParent==pPg->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
5727 
5728 /*
5729 ** This function is used to copy the contents of the b-tree node stored
5730 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
5731 ** the pointer-map entries for each child page are updated so that the
5732 ** parent page stored in the pointer map is page pTo. If pFrom contained
5733 ** any cells with overflow page pointers, then the corresponding pointer
5734 ** map entries are also updated so that the parent page is page pTo.
5735 **
5736 ** If pFrom is currently carrying any overflow cells (entries in the
5737 ** MemPage.aOvfl[] array), they are not copied to pTo.
5738 **
5739 ** Before returning, page pTo is reinitialized using btreeInitPage().
5740 **
5741 ** The performance of this function is not critical. It is only used by
5742 ** the balance_shallower() and balance_deeper() procedures, neither of
5743 ** which are called often under normal circumstances.
5744 */
copyNodeContent(MemPage * pFrom,MemPage * pTo,int * pRC)5745 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
5746   if( (*pRC)==SQLITE_OK ){
5747     BtShared * const pBt = pFrom->pBt;
5748     u8 * const aFrom = pFrom->aData;
5749     u8 * const aTo = pTo->aData;
5750     int const iFromHdr = pFrom->hdrOffset;
5751     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
5752     int rc;
5753     int iData;
5754 
5755 
5756     assert( pFrom->isInit );
5757     assert( pFrom->nFree>=iToHdr );
5758     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
5759 
5760     /* Copy the b-tree node content from page pFrom to page pTo. */
5761     iData = get2byte(&aFrom[iFromHdr+5]);
5762     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
5763     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
5764 
5765     /* Reinitialize page pTo so that the contents of the MemPage structure
5766     ** match the new data. The initialization of pTo can actually fail under
5767     ** fairly obscure circumstances, even though it is a copy of initialized
5768     ** page pFrom.
5769     */
5770     pTo->isInit = 0;
5771     rc = btreeInitPage(pTo);
5772     if( rc!=SQLITE_OK ){
5773       *pRC = rc;
5774       return;
5775     }
5776 
5777     /* If this is an auto-vacuum database, update the pointer-map entries
5778     ** for any b-tree or overflow pages that pTo now contains the pointers to.
5779     */
5780     if( ISAUTOVACUUM ){
5781       *pRC = setChildPtrmaps(pTo);
5782     }
5783   }
5784 }
5785 
5786 /*
5787 ** This routine redistributes cells on the iParentIdx'th child of pParent
5788 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
5789 ** same amount of free space. Usually a single sibling on either side of the
5790 ** page are used in the balancing, though both siblings might come from one
5791 ** side if the page is the first or last child of its parent. If the page
5792 ** has fewer than 2 siblings (something which can only happen if the page
5793 ** is a root page or a child of a root page) then all available siblings
5794 ** participate in the balancing.
5795 **
5796 ** The number of siblings of the page might be increased or decreased by
5797 ** one or two in an effort to keep pages nearly full but not over full.
5798 **
5799 ** Note that when this routine is called, some of the cells on the page
5800 ** might not actually be stored in MemPage.aData[]. This can happen
5801 ** if the page is overfull. This routine ensures that all cells allocated
5802 ** to the page and its siblings fit into MemPage.aData[] before returning.
5803 **
5804 ** In the course of balancing the page and its siblings, cells may be
5805 ** inserted into or removed from the parent page (pParent). Doing so
5806 ** may cause the parent page to become overfull or underfull. If this
5807 ** happens, it is the responsibility of the caller to invoke the correct
5808 ** balancing routine to fix this problem (see the balance() routine).
5809 **
5810 ** If this routine fails for any reason, it might leave the database
5811 ** in a corrupted state. So if this routine fails, the database should
5812 ** be rolled back.
5813 **
5814 ** The third argument to this function, aOvflSpace, is a pointer to a
5815 ** buffer big enough to hold one page. If while inserting cells into the parent
5816 ** page (pParent) the parent page becomes overfull, this buffer is
5817 ** used to store the parent's overflow cells. Because this function inserts
5818 ** a maximum of four divider cells into the parent page, and the maximum
5819 ** size of a cell stored within an internal node is always less than 1/4
5820 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
5821 ** enough for all overflow cells.
5822 **
5823 ** If aOvflSpace is set to a null pointer, this function returns
5824 ** SQLITE_NOMEM.
5825 */
balance_nonroot(MemPage * pParent,int iParentIdx,u8 * aOvflSpace,int isRoot)5826 static int balance_nonroot(
5827   MemPage *pParent,               /* Parent page of siblings being balanced */
5828   int iParentIdx,                 /* Index of "the page" in pParent */
5829   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
5830   int isRoot                      /* True if pParent is a root-page */
5831 ){
5832   BtShared *pBt;               /* The whole database */
5833   int nCell = 0;               /* Number of cells in apCell[] */
5834   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
5835   int nNew = 0;                /* Number of pages in apNew[] */
5836   int nOld;                    /* Number of pages in apOld[] */
5837   int i, j, k;                 /* Loop counters */
5838   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
5839   int rc = SQLITE_OK;          /* The return code */
5840   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
5841   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
5842   int usableSpace;             /* Bytes in pPage beyond the header */
5843   int pageFlags;               /* Value of pPage->aData[0] */
5844   int subtotal;                /* Subtotal of bytes in cells on one page */
5845   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
5846   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
5847   int szScratch;               /* Size of scratch memory requested */
5848   MemPage *apOld[NB];          /* pPage and up to two siblings */
5849   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
5850   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
5851   u8 *pRight;                  /* Location in parent of right-sibling pointer */
5852   u8 *apDiv[NB-1];             /* Divider cells in pParent */
5853   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
5854   int szNew[NB+2];             /* Combined size of cells place on i-th page */
5855   u8 **apCell = 0;             /* All cells begin balanced */
5856   u16 *szCell;                 /* Local size of all cells in apCell[] */
5857   u8 *aSpace1;                 /* Space for copies of dividers cells */
5858   Pgno pgno;                   /* Temp var to store a page number in */
5859 
5860   pBt = pParent->pBt;
5861   assert( sqlite3_mutex_held(pBt->mutex) );
5862   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5863 
5864 #if 0
5865   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
5866 #endif
5867 
5868   /* At this point pParent may have at most one overflow cell. And if
5869   ** this overflow cell is present, it must be the cell with
5870   ** index iParentIdx. This scenario comes about when this function
5871   ** is called (indirectly) from sqlite3BtreeDelete().
5872   */
5873   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
5874   assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx );
5875 
5876   if( !aOvflSpace ){
5877     return SQLITE_NOMEM;
5878   }
5879 
5880   /* Find the sibling pages to balance. Also locate the cells in pParent
5881   ** that divide the siblings. An attempt is made to find NN siblings on
5882   ** either side of pPage. More siblings are taken from one side, however,
5883   ** if there are fewer than NN siblings on the other side. If pParent
5884   ** has NB or fewer children then all children of pParent are taken.
5885   **
5886   ** This loop also drops the divider cells from the parent page. This
5887   ** way, the remainder of the function does not have to deal with any
5888   ** overflow cells in the parent page, since if any existed they will
5889   ** have already been removed.
5890   */
5891   i = pParent->nOverflow + pParent->nCell;
5892   if( i<2 ){
5893     nxDiv = 0;
5894     nOld = i+1;
5895   }else{
5896     nOld = 3;
5897     if( iParentIdx==0 ){
5898       nxDiv = 0;
5899     }else if( iParentIdx==i ){
5900       nxDiv = i-2;
5901     }else{
5902       nxDiv = iParentIdx-1;
5903     }
5904     i = 2;
5905   }
5906   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
5907     pRight = &pParent->aData[pParent->hdrOffset+8];
5908   }else{
5909     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
5910   }
5911   pgno = get4byte(pRight);
5912   while( 1 ){
5913     rc = getAndInitPage(pBt, pgno, &apOld[i]);
5914     if( rc ){
5915       memset(apOld, 0, (i+1)*sizeof(MemPage*));
5916       goto balance_cleanup;
5917     }
5918     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
5919     if( (i--)==0 ) break;
5920 
5921     if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){
5922       apDiv[i] = pParent->aOvfl[0].pCell;
5923       pgno = get4byte(apDiv[i]);
5924       szNew[i] = cellSizePtr(pParent, apDiv[i]);
5925       pParent->nOverflow = 0;
5926     }else{
5927       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
5928       pgno = get4byte(apDiv[i]);
5929       szNew[i] = cellSizePtr(pParent, apDiv[i]);
5930 
5931       /* Drop the cell from the parent page. apDiv[i] still points to
5932       ** the cell within the parent, even though it has been dropped.
5933       ** This is safe because dropping a cell only overwrites the first
5934       ** four bytes of it, and this function does not need the first
5935       ** four bytes of the divider cell. So the pointer is safe to use
5936       ** later on.
5937       **
5938       ** Unless SQLite is compiled in secure-delete mode. In this case,
5939       ** the dropCell() routine will overwrite the entire cell with zeroes.
5940       ** In this case, temporarily copy the cell into the aOvflSpace[]
5941       ** buffer. It will be copied out again as soon as the aSpace[] buffer
5942       ** is allocated.  */
5943       if( pBt->secureDelete ){
5944         int iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
5945         if( (iOff+szNew[i])>(int)pBt->usableSize ){
5946           rc = SQLITE_CORRUPT_BKPT;
5947           memset(apOld, 0, (i+1)*sizeof(MemPage*));
5948           goto balance_cleanup;
5949         }else{
5950           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
5951           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
5952         }
5953       }
5954       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
5955     }
5956   }
5957 
5958   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
5959   ** alignment */
5960   nMaxCells = (nMaxCells + 3)&~3;
5961 
5962   /*
5963   ** Allocate space for memory structures
5964   */
5965   k = pBt->pageSize + ROUND8(sizeof(MemPage));
5966   szScratch =
5967        nMaxCells*sizeof(u8*)                       /* apCell */
5968      + nMaxCells*sizeof(u16)                       /* szCell */
5969      + pBt->pageSize                               /* aSpace1 */
5970      + k*nOld;                                     /* Page copies (apCopy) */
5971   apCell = sqlite3ScratchMalloc( szScratch );
5972   if( apCell==0 ){
5973     rc = SQLITE_NOMEM;
5974     goto balance_cleanup;
5975   }
5976   szCell = (u16*)&apCell[nMaxCells];
5977   aSpace1 = (u8*)&szCell[nMaxCells];
5978   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
5979 
5980   /*
5981   ** Load pointers to all cells on sibling pages and the divider cells
5982   ** into the local apCell[] array.  Make copies of the divider cells
5983   ** into space obtained from aSpace1[] and remove the the divider Cells
5984   ** from pParent.
5985   **
5986   ** If the siblings are on leaf pages, then the child pointers of the
5987   ** divider cells are stripped from the cells before they are copied
5988   ** into aSpace1[].  In this way, all cells in apCell[] are without
5989   ** child pointers.  If siblings are not leaves, then all cell in
5990   ** apCell[] include child pointers.  Either way, all cells in apCell[]
5991   ** are alike.
5992   **
5993   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
5994   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
5995   */
5996   leafCorrection = apOld[0]->leaf*4;
5997   leafData = apOld[0]->hasData;
5998   for(i=0; i<nOld; i++){
5999     int limit;
6000 
6001     /* Before doing anything else, take a copy of the i'th original sibling
6002     ** The rest of this function will use data from the copies rather
6003     ** that the original pages since the original pages will be in the
6004     ** process of being overwritten.  */
6005     MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
6006     memcpy(pOld, apOld[i], sizeof(MemPage));
6007     pOld->aData = (void*)&pOld[1];
6008     memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
6009 
6010     limit = pOld->nCell+pOld->nOverflow;
6011     for(j=0; j<limit; j++){
6012       assert( nCell<nMaxCells );
6013       apCell[nCell] = findOverflowCell(pOld, j);
6014       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6015       nCell++;
6016     }
6017     if( i<nOld-1 && !leafData){
6018       u16 sz = (u16)szNew[i];
6019       u8 *pTemp;
6020       assert( nCell<nMaxCells );
6021       szCell[nCell] = sz;
6022       pTemp = &aSpace1[iSpace1];
6023       iSpace1 += sz;
6024       assert( sz<=pBt->maxLocal+23 );
6025       assert( iSpace1 <= (int)pBt->pageSize );
6026       memcpy(pTemp, apDiv[i], sz);
6027       apCell[nCell] = pTemp+leafCorrection;
6028       assert( leafCorrection==0 || leafCorrection==4 );
6029       szCell[nCell] = szCell[nCell] - leafCorrection;
6030       if( !pOld->leaf ){
6031         assert( leafCorrection==0 );
6032         assert( pOld->hdrOffset==0 );
6033         /* The right pointer of the child page pOld becomes the left
6034         ** pointer of the divider cell */
6035         memcpy(apCell[nCell], &pOld->aData[8], 4);
6036       }else{
6037         assert( leafCorrection==4 );
6038         if( szCell[nCell]<4 ){
6039           /* Do not allow any cells smaller than 4 bytes. */
6040           szCell[nCell] = 4;
6041         }
6042       }
6043       nCell++;
6044     }
6045   }
6046 
6047   /*
6048   ** Figure out the number of pages needed to hold all nCell cells.
6049   ** Store this number in "k".  Also compute szNew[] which is the total
6050   ** size of all cells on the i-th page and cntNew[] which is the index
6051   ** in apCell[] of the cell that divides page i from page i+1.
6052   ** cntNew[k] should equal nCell.
6053   **
6054   ** Values computed by this block:
6055   **
6056   **           k: The total number of sibling pages
6057   **    szNew[i]: Spaced used on the i-th sibling page.
6058   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
6059   **              the right of the i-th sibling page.
6060   ** usableSpace: Number of bytes of space available on each sibling.
6061   **
6062   */
6063   usableSpace = pBt->usableSize - 12 + leafCorrection;
6064   for(subtotal=k=i=0; i<nCell; i++){
6065     assert( i<nMaxCells );
6066     subtotal += szCell[i] + 2;
6067     if( subtotal > usableSpace ){
6068       szNew[k] = subtotal - szCell[i];
6069       cntNew[k] = i;
6070       if( leafData ){ i--; }
6071       subtotal = 0;
6072       k++;
6073       if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
6074     }
6075   }
6076   szNew[k] = subtotal;
6077   cntNew[k] = nCell;
6078   k++;
6079 
6080   /*
6081   ** The packing computed by the previous block is biased toward the siblings
6082   ** on the left side.  The left siblings are always nearly full, while the
6083   ** right-most sibling might be nearly empty.  This block of code attempts
6084   ** to adjust the packing of siblings to get a better balance.
6085   **
6086   ** This adjustment is more than an optimization.  The packing above might
6087   ** be so out of balance as to be illegal.  For example, the right-most
6088   ** sibling might be completely empty.  This adjustment is not optional.
6089   */
6090   for(i=k-1; i>0; i--){
6091     int szRight = szNew[i];  /* Size of sibling on the right */
6092     int szLeft = szNew[i-1]; /* Size of sibling on the left */
6093     int r;              /* Index of right-most cell in left sibling */
6094     int d;              /* Index of first cell to the left of right sibling */
6095 
6096     r = cntNew[i-1] - 1;
6097     d = r + 1 - leafData;
6098     assert( d<nMaxCells );
6099     assert( r<nMaxCells );
6100     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
6101       szRight += szCell[d] + 2;
6102       szLeft -= szCell[r] + 2;
6103       cntNew[i-1]--;
6104       r = cntNew[i-1] - 1;
6105       d = r + 1 - leafData;
6106     }
6107     szNew[i] = szRight;
6108     szNew[i-1] = szLeft;
6109   }
6110 
6111   /* Either we found one or more cells (cntnew[0])>0) or pPage is
6112   ** a virtual root page.  A virtual root page is when the real root
6113   ** page is page 1 and we are the only child of that page.
6114   */
6115   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
6116 
6117   TRACE(("BALANCE: old: %d %d %d  ",
6118     apOld[0]->pgno,
6119     nOld>=2 ? apOld[1]->pgno : 0,
6120     nOld>=3 ? apOld[2]->pgno : 0
6121   ));
6122 
6123   /*
6124   ** Allocate k new pages.  Reuse old pages where possible.
6125   */
6126   if( apOld[0]->pgno<=1 ){
6127     rc = SQLITE_CORRUPT_BKPT;
6128     goto balance_cleanup;
6129   }
6130   pageFlags = apOld[0]->aData[0];
6131   for(i=0; i<k; i++){
6132     MemPage *pNew;
6133     if( i<nOld ){
6134       pNew = apNew[i] = apOld[i];
6135       apOld[i] = 0;
6136       rc = sqlite3PagerWrite(pNew->pDbPage);
6137       nNew++;
6138       if( rc ) goto balance_cleanup;
6139     }else{
6140       assert( i>0 );
6141       rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
6142       if( rc ) goto balance_cleanup;
6143       apNew[i] = pNew;
6144       nNew++;
6145 
6146       /* Set the pointer-map entry for the new sibling page. */
6147       if( ISAUTOVACUUM ){
6148         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
6149         if( rc!=SQLITE_OK ){
6150           goto balance_cleanup;
6151         }
6152       }
6153     }
6154   }
6155 
6156   /* Free any old pages that were not reused as new pages.
6157   */
6158   while( i<nOld ){
6159     freePage(apOld[i], &rc);
6160     if( rc ) goto balance_cleanup;
6161     releasePage(apOld[i]);
6162     apOld[i] = 0;
6163     i++;
6164   }
6165 
6166   /*
6167   ** Put the new pages in accending order.  This helps to
6168   ** keep entries in the disk file in order so that a scan
6169   ** of the table is a linear scan through the file.  That
6170   ** in turn helps the operating system to deliver pages
6171   ** from the disk more rapidly.
6172   **
6173   ** An O(n^2) insertion sort algorithm is used, but since
6174   ** n is never more than NB (a small constant), that should
6175   ** not be a problem.
6176   **
6177   ** When NB==3, this one optimization makes the database
6178   ** about 25% faster for large insertions and deletions.
6179   */
6180   for(i=0; i<k-1; i++){
6181     int minV = apNew[i]->pgno;
6182     int minI = i;
6183     for(j=i+1; j<k; j++){
6184       if( apNew[j]->pgno<(unsigned)minV ){
6185         minI = j;
6186         minV = apNew[j]->pgno;
6187       }
6188     }
6189     if( minI>i ){
6190       MemPage *pT;
6191       pT = apNew[i];
6192       apNew[i] = apNew[minI];
6193       apNew[minI] = pT;
6194     }
6195   }
6196   TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
6197     apNew[0]->pgno, szNew[0],
6198     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
6199     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
6200     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
6201     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
6202 
6203   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6204   put4byte(pRight, apNew[nNew-1]->pgno);
6205 
6206   /*
6207   ** Evenly distribute the data in apCell[] across the new pages.
6208   ** Insert divider cells into pParent as necessary.
6209   */
6210   j = 0;
6211   for(i=0; i<nNew; i++){
6212     /* Assemble the new sibling page. */
6213     MemPage *pNew = apNew[i];
6214     assert( j<nMaxCells );
6215     zeroPage(pNew, pageFlags);
6216     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
6217     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
6218     assert( pNew->nOverflow==0 );
6219 
6220     j = cntNew[i];
6221 
6222     /* If the sibling page assembled above was not the right-most sibling,
6223     ** insert a divider cell into the parent page.
6224     */
6225     assert( i<nNew-1 || j==nCell );
6226     if( j<nCell ){
6227       u8 *pCell;
6228       u8 *pTemp;
6229       int sz;
6230 
6231       assert( j<nMaxCells );
6232       pCell = apCell[j];
6233       sz = szCell[j] + leafCorrection;
6234       pTemp = &aOvflSpace[iOvflSpace];
6235       if( !pNew->leaf ){
6236         memcpy(&pNew->aData[8], pCell, 4);
6237       }else if( leafData ){
6238         /* If the tree is a leaf-data tree, and the siblings are leaves,
6239         ** then there is no divider cell in apCell[]. Instead, the divider
6240         ** cell consists of the integer key for the right-most cell of
6241         ** the sibling-page assembled above only.
6242         */
6243         CellInfo info;
6244         j--;
6245         btreeParseCellPtr(pNew, apCell[j], &info);
6246         pCell = pTemp;
6247         sz = 4 + putVarint(&pCell[4], info.nKey);
6248         pTemp = 0;
6249       }else{
6250         pCell -= 4;
6251         /* Obscure case for non-leaf-data trees: If the cell at pCell was
6252         ** previously stored on a leaf node, and its reported size was 4
6253         ** bytes, then it may actually be smaller than this
6254         ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
6255         ** any cell). But it is important to pass the correct size to
6256         ** insertCell(), so reparse the cell now.
6257         **
6258         ** Note that this can never happen in an SQLite data file, as all
6259         ** cells are at least 4 bytes. It only happens in b-trees used
6260         ** to evaluate "IN (SELECT ...)" and similar clauses.
6261         */
6262         if( szCell[j]==4 ){
6263           assert(leafCorrection==4);
6264           sz = cellSizePtr(pParent, pCell);
6265         }
6266       }
6267       iOvflSpace += sz;
6268       assert( sz<=pBt->maxLocal+23 );
6269       assert( iOvflSpace <= (int)pBt->pageSize );
6270       insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
6271       if( rc!=SQLITE_OK ) goto balance_cleanup;
6272       assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6273 
6274       j++;
6275       nxDiv++;
6276     }
6277   }
6278   assert( j==nCell );
6279   assert( nOld>0 );
6280   assert( nNew>0 );
6281   if( (pageFlags & PTF_LEAF)==0 ){
6282     u8 *zChild = &apCopy[nOld-1]->aData[8];
6283     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
6284   }
6285 
6286   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6287     /* The root page of the b-tree now contains no cells. The only sibling
6288     ** page is the right-child of the parent. Copy the contents of the
6289     ** child page into the parent, decreasing the overall height of the
6290     ** b-tree structure by one. This is described as the "balance-shallower"
6291     ** sub-algorithm in some documentation.
6292     **
6293     ** If this is an auto-vacuum database, the call to copyNodeContent()
6294     ** sets all pointer-map entries corresponding to database image pages
6295     ** for which the pointer is stored within the content being copied.
6296     **
6297     ** The second assert below verifies that the child page is defragmented
6298     ** (it must be, as it was just reconstructed using assemblePage()). This
6299     ** is important if the parent page happens to be page 1 of the database
6300     ** image.  */
6301     assert( nNew==1 );
6302     assert( apNew[0]->nFree ==
6303         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6304     );
6305     copyNodeContent(apNew[0], pParent, &rc);
6306     freePage(apNew[0], &rc);
6307   }else if( ISAUTOVACUUM ){
6308     /* Fix the pointer-map entries for all the cells that were shifted around.
6309     ** There are several different types of pointer-map entries that need to
6310     ** be dealt with by this routine. Some of these have been set already, but
6311     ** many have not. The following is a summary:
6312     **
6313     **   1) The entries associated with new sibling pages that were not
6314     **      siblings when this function was called. These have already
6315     **      been set. We don't need to worry about old siblings that were
6316     **      moved to the free-list - the freePage() code has taken care
6317     **      of those.
6318     **
6319     **   2) The pointer-map entries associated with the first overflow
6320     **      page in any overflow chains used by new divider cells. These
6321     **      have also already been taken care of by the insertCell() code.
6322     **
6323     **   3) If the sibling pages are not leaves, then the child pages of
6324     **      cells stored on the sibling pages may need to be updated.
6325     **
6326     **   4) If the sibling pages are not internal intkey nodes, then any
6327     **      overflow pages used by these cells may need to be updated
6328     **      (internal intkey nodes never contain pointers to overflow pages).
6329     **
6330     **   5) If the sibling pages are not leaves, then the pointer-map
6331     **      entries for the right-child pages of each sibling may need
6332     **      to be updated.
6333     **
6334     ** Cases 1 and 2 are dealt with above by other code. The next
6335     ** block deals with cases 3 and 4 and the one after that, case 5. Since
6336     ** setting a pointer map entry is a relatively expensive operation, this
6337     ** code only sets pointer map entries for child or overflow pages that have
6338     ** actually moved between pages.  */
6339     MemPage *pNew = apNew[0];
6340     MemPage *pOld = apCopy[0];
6341     int nOverflow = pOld->nOverflow;
6342     int iNextOld = pOld->nCell + nOverflow;
6343     int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1);
6344     j = 0;                             /* Current 'old' sibling page */
6345     k = 0;                             /* Current 'new' sibling page */
6346     for(i=0; i<nCell; i++){
6347       int isDivider = 0;
6348       while( i==iNextOld ){
6349         /* Cell i is the cell immediately following the last cell on old
6350         ** sibling page j. If the siblings are not leaf pages of an
6351         ** intkey b-tree, then cell i was a divider cell. */
6352         pOld = apCopy[++j];
6353         iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6354         if( pOld->nOverflow ){
6355           nOverflow = pOld->nOverflow;
6356           iOverflow = i + !leafData + pOld->aOvfl[0].idx;
6357         }
6358         isDivider = !leafData;
6359       }
6360 
6361       assert(nOverflow>0 || iOverflow<i );
6362       assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1);
6363       assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1);
6364       if( i==iOverflow ){
6365         isDivider = 1;
6366         if( (--nOverflow)>0 ){
6367           iOverflow++;
6368         }
6369       }
6370 
6371       if( i==cntNew[k] ){
6372         /* Cell i is the cell immediately following the last cell on new
6373         ** sibling page k. If the siblings are not leaf pages of an
6374         ** intkey b-tree, then cell i is a divider cell.  */
6375         pNew = apNew[++k];
6376         if( !leafData ) continue;
6377       }
6378       assert( j<nOld );
6379       assert( k<nNew );
6380 
6381       /* If the cell was originally divider cell (and is not now) or
6382       ** an overflow cell, or if the cell was located on a different sibling
6383       ** page before the balancing, then the pointer map entries associated
6384       ** with any child or overflow pages need to be updated.  */
6385       if( isDivider || pOld->pgno!=pNew->pgno ){
6386         if( !leafCorrection ){
6387           ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
6388         }
6389         if( szCell[i]>pNew->minLocal ){
6390           ptrmapPutOvflPtr(pNew, apCell[i], &rc);
6391         }
6392       }
6393     }
6394 
6395     if( !leafCorrection ){
6396       for(i=0; i<nNew; i++){
6397         u32 key = get4byte(&apNew[i]->aData[8]);
6398         ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
6399       }
6400     }
6401 
6402 #if 0
6403     /* The ptrmapCheckPages() contains assert() statements that verify that
6404     ** all pointer map pages are set correctly. This is helpful while
6405     ** debugging. This is usually disabled because a corrupt database may
6406     ** cause an assert() statement to fail.  */
6407     ptrmapCheckPages(apNew, nNew);
6408     ptrmapCheckPages(&pParent, 1);
6409 #endif
6410   }
6411 
6412   assert( pParent->isInit );
6413   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6414           nOld, nNew, nCell));
6415 
6416   /*
6417   ** Cleanup before returning.
6418   */
6419 balance_cleanup:
6420   sqlite3ScratchFree(apCell);
6421   for(i=0; i<nOld; i++){
6422     releasePage(apOld[i]);
6423   }
6424   for(i=0; i<nNew; i++){
6425     releasePage(apNew[i]);
6426   }
6427 
6428   return rc;
6429 }
6430 
6431 
6432 /*
6433 ** This function is called when the root page of a b-tree structure is
6434 ** overfull (has one or more overflow pages).
6435 **
6436 ** A new child page is allocated and the contents of the current root
6437 ** page, including overflow cells, are copied into the child. The root
6438 ** page is then overwritten to make it an empty page with the right-child
6439 ** pointer pointing to the new page.
6440 **
6441 ** Before returning, all pointer-map entries corresponding to pages
6442 ** that the new child-page now contains pointers to are updated. The
6443 ** entry corresponding to the new right-child pointer of the root
6444 ** page is also updated.
6445 **
6446 ** If successful, *ppChild is set to contain a reference to the child
6447 ** page and SQLITE_OK is returned. In this case the caller is required
6448 ** to call releasePage() on *ppChild exactly once. If an error occurs,
6449 ** an error code is returned and *ppChild is set to 0.
6450 */
balance_deeper(MemPage * pRoot,MemPage ** ppChild)6451 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
6452   int rc;                        /* Return value from subprocedures */
6453   MemPage *pChild = 0;           /* Pointer to a new child page */
6454   Pgno pgnoChild = 0;            /* Page number of the new child page */
6455   BtShared *pBt = pRoot->pBt;    /* The BTree */
6456 
6457   assert( pRoot->nOverflow>0 );
6458   assert( sqlite3_mutex_held(pBt->mutex) );
6459 
6460   /* Make pRoot, the root page of the b-tree, writable. Allocate a new
6461   ** page that will become the new right-child of pPage. Copy the contents
6462   ** of the node stored on pRoot into the new child page.
6463   */
6464   rc = sqlite3PagerWrite(pRoot->pDbPage);
6465   if( rc==SQLITE_OK ){
6466     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
6467     copyNodeContent(pRoot, pChild, &rc);
6468     if( ISAUTOVACUUM ){
6469       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
6470     }
6471   }
6472   if( rc ){
6473     *ppChild = 0;
6474     releasePage(pChild);
6475     return rc;
6476   }
6477   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
6478   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6479   assert( pChild->nCell==pRoot->nCell );
6480 
6481   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
6482 
6483   /* Copy the overflow cells from pRoot to pChild */
6484   memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0]));
6485   pChild->nOverflow = pRoot->nOverflow;
6486 
6487   /* Zero the contents of pRoot. Then install pChild as the right-child. */
6488   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
6489   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
6490 
6491   *ppChild = pChild;
6492   return SQLITE_OK;
6493 }
6494 
6495 /*
6496 ** The page that pCur currently points to has just been modified in
6497 ** some way. This function figures out if this modification means the
6498 ** tree needs to be balanced, and if so calls the appropriate balancing
6499 ** routine. Balancing routines are:
6500 **
6501 **   balance_quick()
6502 **   balance_deeper()
6503 **   balance_nonroot()
6504 */
balance(BtCursor * pCur)6505 static int balance(BtCursor *pCur){
6506   int rc = SQLITE_OK;
6507   const int nMin = pCur->pBt->usableSize * 2 / 3;
6508   u8 aBalanceQuickSpace[13];
6509   u8 *pFree = 0;
6510 
6511   TESTONLY( int balance_quick_called = 0 );
6512   TESTONLY( int balance_deeper_called = 0 );
6513 
6514   do {
6515     int iPage = pCur->iPage;
6516     MemPage *pPage = pCur->apPage[iPage];
6517 
6518     if( iPage==0 ){
6519       if( pPage->nOverflow ){
6520         /* The root page of the b-tree is overfull. In this case call the
6521         ** balance_deeper() function to create a new child for the root-page
6522         ** and copy the current contents of the root-page to it. The
6523         ** next iteration of the do-loop will balance the child page.
6524         */
6525         assert( (balance_deeper_called++)==0 );
6526         rc = balance_deeper(pPage, &pCur->apPage[1]);
6527         if( rc==SQLITE_OK ){
6528           pCur->iPage = 1;
6529           pCur->aiIdx[0] = 0;
6530           pCur->aiIdx[1] = 0;
6531           assert( pCur->apPage[1]->nOverflow );
6532         }
6533       }else{
6534         break;
6535       }
6536     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
6537       break;
6538     }else{
6539       MemPage * const pParent = pCur->apPage[iPage-1];
6540       int const iIdx = pCur->aiIdx[iPage-1];
6541 
6542       rc = sqlite3PagerWrite(pParent->pDbPage);
6543       if( rc==SQLITE_OK ){
6544 #ifndef SQLITE_OMIT_QUICKBALANCE
6545         if( pPage->hasData
6546          && pPage->nOverflow==1
6547          && pPage->aOvfl[0].idx==pPage->nCell
6548          && pParent->pgno!=1
6549          && pParent->nCell==iIdx
6550         ){
6551           /* Call balance_quick() to create a new sibling of pPage on which
6552           ** to store the overflow cell. balance_quick() inserts a new cell
6553           ** into pParent, which may cause pParent overflow. If this
6554           ** happens, the next interation of the do-loop will balance pParent
6555           ** use either balance_nonroot() or balance_deeper(). Until this
6556           ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
6557           ** buffer.
6558           **
6559           ** The purpose of the following assert() is to check that only a
6560           ** single call to balance_quick() is made for each call to this
6561           ** function. If this were not verified, a subtle bug involving reuse
6562           ** of the aBalanceQuickSpace[] might sneak in.
6563           */
6564           assert( (balance_quick_called++)==0 );
6565           rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
6566         }else
6567 #endif
6568         {
6569           /* In this case, call balance_nonroot() to redistribute cells
6570           ** between pPage and up to 2 of its sibling pages. This involves
6571           ** modifying the contents of pParent, which may cause pParent to
6572           ** become overfull or underfull. The next iteration of the do-loop
6573           ** will balance the parent page to correct this.
6574           **
6575           ** If the parent page becomes overfull, the overflow cell or cells
6576           ** are stored in the pSpace buffer allocated immediately below.
6577           ** A subsequent iteration of the do-loop will deal with this by
6578           ** calling balance_nonroot() (balance_deeper() may be called first,
6579           ** but it doesn't deal with overflow cells - just moves them to a
6580           ** different page). Once this subsequent call to balance_nonroot()
6581           ** has completed, it is safe to release the pSpace buffer used by
6582           ** the previous call, as the overflow cell data will have been
6583           ** copied either into the body of a database page or into the new
6584           ** pSpace buffer passed to the latter call to balance_nonroot().
6585           */
6586           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
6587           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
6588           if( pFree ){
6589             /* If pFree is not NULL, it points to the pSpace buffer used
6590             ** by a previous call to balance_nonroot(). Its contents are
6591             ** now stored either on real database pages or within the
6592             ** new pSpace buffer, so it may be safely freed here. */
6593             sqlite3PageFree(pFree);
6594           }
6595 
6596           /* The pSpace buffer will be freed after the next call to
6597           ** balance_nonroot(), or just before this function returns, whichever
6598           ** comes first. */
6599           pFree = pSpace;
6600         }
6601       }
6602 
6603       pPage->nOverflow = 0;
6604 
6605       /* The next iteration of the do-loop balances the parent page. */
6606       releasePage(pPage);
6607       pCur->iPage--;
6608     }
6609   }while( rc==SQLITE_OK );
6610 
6611   if( pFree ){
6612     sqlite3PageFree(pFree);
6613   }
6614   return rc;
6615 }
6616 
6617 
6618 /*
6619 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
6620 ** and the data is given by (pData,nData).  The cursor is used only to
6621 ** define what table the record should be inserted into.  The cursor
6622 ** is left pointing at a random location.
6623 **
6624 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
6625 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
6626 **
6627 ** If the seekResult parameter is non-zero, then a successful call to
6628 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
6629 ** been performed. seekResult is the search result returned (a negative
6630 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
6631 ** a positive value if pCur points at an etry that is larger than
6632 ** (pKey, nKey)).
6633 **
6634 ** If the seekResult parameter is non-zero, then the caller guarantees that
6635 ** cursor pCur is pointing at the existing copy of a row that is to be
6636 ** overwritten.  If the seekResult parameter is 0, then cursor pCur may
6637 ** point to any entry or to no entry at all and so this function has to seek
6638 ** the cursor before the new key can be inserted.
6639 */
sqlite3BtreeInsert(BtCursor * pCur,const void * pKey,i64 nKey,const void * pData,int nData,int nZero,int appendBias,int seekResult)6640 int sqlite3BtreeInsert(
6641   BtCursor *pCur,                /* Insert data into the table of this cursor */
6642   const void *pKey, i64 nKey,    /* The key of the new record */
6643   const void *pData, int nData,  /* The data of the new record */
6644   int nZero,                     /* Number of extra 0 bytes to append to data */
6645   int appendBias,                /* True if this is likely an append */
6646   int seekResult                 /* Result of prior MovetoUnpacked() call */
6647 ){
6648   int rc;
6649   int loc = seekResult;          /* -1: before desired location  +1: after */
6650   int szNew = 0;
6651   int idx;
6652   MemPage *pPage;
6653   Btree *p = pCur->pBtree;
6654   BtShared *pBt = p->pBt;
6655   unsigned char *oldCell;
6656   unsigned char *newCell = 0;
6657 
6658   if( pCur->eState==CURSOR_FAULT ){
6659     assert( pCur->skipNext!=SQLITE_OK );
6660     return pCur->skipNext;
6661   }
6662 
6663   assert( cursorHoldsMutex(pCur) );
6664   assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly );
6665   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6666 
6667   /* Assert that the caller has been consistent. If this cursor was opened
6668   ** expecting an index b-tree, then the caller should be inserting blob
6669   ** keys with no associated data. If the cursor was opened expecting an
6670   ** intkey table, the caller should be inserting integer keys with a
6671   ** blob of associated data.  */
6672   assert( (pKey==0)==(pCur->pKeyInfo==0) );
6673 
6674   /* If this is an insert into a table b-tree, invalidate any incrblob
6675   ** cursors open on the row being replaced (assuming this is a replace
6676   ** operation - if it is not, the following is a no-op).  */
6677   if( pCur->pKeyInfo==0 ){
6678     invalidateIncrblobCursors(p, nKey, 0);
6679   }
6680 
6681   /* Save the positions of any other cursors open on this table.
6682   **
6683   ** In some cases, the call to btreeMoveto() below is a no-op. For
6684   ** example, when inserting data into a table with auto-generated integer
6685   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
6686   ** integer key to use. It then calls this function to actually insert the
6687   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
6688   ** that the cursor is already where it needs to be and returns without
6689   ** doing any work. To avoid thwarting these optimizations, it is important
6690   ** not to clear the cursor here.
6691   */
6692   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6693   if( rc ) return rc;
6694   if( !loc ){
6695     rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
6696     if( rc ) return rc;
6697   }
6698   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
6699 
6700   pPage = pCur->apPage[pCur->iPage];
6701   assert( pPage->intKey || nKey>=0 );
6702   assert( pPage->leaf || !pPage->intKey );
6703 
6704   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
6705           pCur->pgnoRoot, nKey, nData, pPage->pgno,
6706           loc==0 ? "overwrite" : "new entry"));
6707   assert( pPage->isInit );
6708   allocateTempSpace(pBt);
6709   newCell = pBt->pTmpSpace;
6710   if( newCell==0 ) return SQLITE_NOMEM;
6711   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
6712   if( rc ) goto end_insert;
6713   assert( szNew==cellSizePtr(pPage, newCell) );
6714   assert( szNew <= MX_CELL_SIZE(pBt) );
6715   idx = pCur->aiIdx[pCur->iPage];
6716   if( loc==0 ){
6717     u16 szOld;
6718     assert( idx<pPage->nCell );
6719     rc = sqlite3PagerWrite(pPage->pDbPage);
6720     if( rc ){
6721       goto end_insert;
6722     }
6723     oldCell = findCell(pPage, idx);
6724     if( !pPage->leaf ){
6725       memcpy(newCell, oldCell, 4);
6726     }
6727     szOld = cellSizePtr(pPage, oldCell);
6728     rc = clearCell(pPage, oldCell);
6729     dropCell(pPage, idx, szOld, &rc);
6730     if( rc ) goto end_insert;
6731   }else if( loc<0 && pPage->nCell>0 ){
6732     assert( pPage->leaf );
6733     idx = ++pCur->aiIdx[pCur->iPage];
6734   }else{
6735     assert( pPage->leaf );
6736   }
6737   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
6738   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
6739 
6740   /* If no error has occured and pPage has an overflow cell, call balance()
6741   ** to redistribute the cells within the tree. Since balance() may move
6742   ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
6743   ** variables.
6744   **
6745   ** Previous versions of SQLite called moveToRoot() to move the cursor
6746   ** back to the root page as balance() used to invalidate the contents
6747   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
6748   ** set the cursor state to "invalid". This makes common insert operations
6749   ** slightly faster.
6750   **
6751   ** There is a subtle but important optimization here too. When inserting
6752   ** multiple records into an intkey b-tree using a single cursor (as can
6753   ** happen while processing an "INSERT INTO ... SELECT" statement), it
6754   ** is advantageous to leave the cursor pointing to the last entry in
6755   ** the b-tree if possible. If the cursor is left pointing to the last
6756   ** entry in the table, and the next row inserted has an integer key
6757   ** larger than the largest existing key, it is possible to insert the
6758   ** row without seeking the cursor. This can be a big performance boost.
6759   */
6760   pCur->info.nSize = 0;
6761   pCur->validNKey = 0;
6762   if( rc==SQLITE_OK && pPage->nOverflow ){
6763     rc = balance(pCur);
6764 
6765     /* Must make sure nOverflow is reset to zero even if the balance()
6766     ** fails. Internal data structure corruption will result otherwise.
6767     ** Also, set the cursor state to invalid. This stops saveCursorPosition()
6768     ** from trying to save the current position of the cursor.  */
6769     pCur->apPage[pCur->iPage]->nOverflow = 0;
6770     pCur->eState = CURSOR_INVALID;
6771   }
6772   assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
6773 
6774 end_insert:
6775   return rc;
6776 }
6777 
6778 /*
6779 ** Delete the entry that the cursor is pointing to.  The cursor
6780 ** is left pointing at a arbitrary location.
6781 */
sqlite3BtreeDelete(BtCursor * pCur)6782 int sqlite3BtreeDelete(BtCursor *pCur){
6783   Btree *p = pCur->pBtree;
6784   BtShared *pBt = p->pBt;
6785   int rc;                              /* Return code */
6786   MemPage *pPage;                      /* Page to delete cell from */
6787   unsigned char *pCell;                /* Pointer to cell to delete */
6788   int iCellIdx;                        /* Index of cell to delete */
6789   int iCellDepth;                      /* Depth of node containing pCell */
6790 
6791   assert( cursorHoldsMutex(pCur) );
6792   assert( pBt->inTransaction==TRANS_WRITE );
6793   assert( !pBt->readOnly );
6794   assert( pCur->wrFlag );
6795   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6796   assert( !hasReadConflicts(p, pCur->pgnoRoot) );
6797 
6798   if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
6799    || NEVER(pCur->eState!=CURSOR_VALID)
6800   ){
6801     return SQLITE_ERROR;  /* Something has gone awry. */
6802   }
6803 
6804   /* If this is a delete operation to remove a row from a table b-tree,
6805   ** invalidate any incrblob cursors open on the row being deleted.  */
6806   if( pCur->pKeyInfo==0 ){
6807     invalidateIncrblobCursors(p, pCur->info.nKey, 0);
6808   }
6809 
6810   iCellDepth = pCur->iPage;
6811   iCellIdx = pCur->aiIdx[iCellDepth];
6812   pPage = pCur->apPage[iCellDepth];
6813   pCell = findCell(pPage, iCellIdx);
6814 
6815   /* If the page containing the entry to delete is not a leaf page, move
6816   ** the cursor to the largest entry in the tree that is smaller than
6817   ** the entry being deleted. This cell will replace the cell being deleted
6818   ** from the internal node. The 'previous' entry is used for this instead
6819   ** of the 'next' entry, as the previous entry is always a part of the
6820   ** sub-tree headed by the child page of the cell being deleted. This makes
6821   ** balancing the tree following the delete operation easier.  */
6822   if( !pPage->leaf ){
6823     int notUsed;
6824     rc = sqlite3BtreePrevious(pCur, &notUsed);
6825     if( rc ) return rc;
6826   }
6827 
6828   /* Save the positions of any other cursors open on this table before
6829   ** making any modifications. Make the page containing the entry to be
6830   ** deleted writable. Then free any overflow pages associated with the
6831   ** entry and finally remove the cell itself from within the page.
6832   */
6833   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6834   if( rc ) return rc;
6835   rc = sqlite3PagerWrite(pPage->pDbPage);
6836   if( rc ) return rc;
6837   rc = clearCell(pPage, pCell);
6838   dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
6839   if( rc ) return rc;
6840 
6841   /* If the cell deleted was not located on a leaf page, then the cursor
6842   ** is currently pointing to the largest entry in the sub-tree headed
6843   ** by the child-page of the cell that was just deleted from an internal
6844   ** node. The cell from the leaf node needs to be moved to the internal
6845   ** node to replace the deleted cell.  */
6846   if( !pPage->leaf ){
6847     MemPage *pLeaf = pCur->apPage[pCur->iPage];
6848     int nCell;
6849     Pgno n = pCur->apPage[iCellDepth+1]->pgno;
6850     unsigned char *pTmp;
6851 
6852     pCell = findCell(pLeaf, pLeaf->nCell-1);
6853     nCell = cellSizePtr(pLeaf, pCell);
6854     assert( MX_CELL_SIZE(pBt) >= nCell );
6855 
6856     allocateTempSpace(pBt);
6857     pTmp = pBt->pTmpSpace;
6858 
6859     rc = sqlite3PagerWrite(pLeaf->pDbPage);
6860     insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
6861     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
6862     if( rc ) return rc;
6863   }
6864 
6865   /* Balance the tree. If the entry deleted was located on a leaf page,
6866   ** then the cursor still points to that page. In this case the first
6867   ** call to balance() repairs the tree, and the if(...) condition is
6868   ** never true.
6869   **
6870   ** Otherwise, if the entry deleted was on an internal node page, then
6871   ** pCur is pointing to the leaf page from which a cell was removed to
6872   ** replace the cell deleted from the internal node. This is slightly
6873   ** tricky as the leaf node may be underfull, and the internal node may
6874   ** be either under or overfull. In this case run the balancing algorithm
6875   ** on the leaf node first. If the balance proceeds far enough up the
6876   ** tree that we can be sure that any problem in the internal node has
6877   ** been corrected, so be it. Otherwise, after balancing the leaf node,
6878   ** walk the cursor up the tree to the internal node and balance it as
6879   ** well.  */
6880   rc = balance(pCur);
6881   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
6882     while( pCur->iPage>iCellDepth ){
6883       releasePage(pCur->apPage[pCur->iPage--]);
6884     }
6885     rc = balance(pCur);
6886   }
6887 
6888   if( rc==SQLITE_OK ){
6889     moveToRoot(pCur);
6890   }
6891   return rc;
6892 }
6893 
6894 /*
6895 ** Create a new BTree table.  Write into *piTable the page
6896 ** number for the root page of the new table.
6897 **
6898 ** The type of type is determined by the flags parameter.  Only the
6899 ** following values of flags are currently in use.  Other values for
6900 ** flags might not work:
6901 **
6902 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
6903 **     BTREE_ZERODATA                  Used for SQL indices
6904 */
btreeCreateTable(Btree * p,int * piTable,int createTabFlags)6905 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
6906   BtShared *pBt = p->pBt;
6907   MemPage *pRoot;
6908   Pgno pgnoRoot;
6909   int rc;
6910   int ptfFlags;          /* Page-type flage for the root page of new table */
6911 
6912   assert( sqlite3BtreeHoldsMutex(p) );
6913   assert( pBt->inTransaction==TRANS_WRITE );
6914   assert( !pBt->readOnly );
6915 
6916 #ifdef SQLITE_OMIT_AUTOVACUUM
6917   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6918   if( rc ){
6919     return rc;
6920   }
6921 #else
6922   if( pBt->autoVacuum ){
6923     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
6924     MemPage *pPageMove; /* The page to move to. */
6925 
6926     /* Creating a new table may probably require moving an existing database
6927     ** to make room for the new tables root page. In case this page turns
6928     ** out to be an overflow page, delete all overflow page-map caches
6929     ** held by open cursors.
6930     */
6931     invalidateAllOverflowCache(pBt);
6932 
6933     /* Read the value of meta[3] from the database to determine where the
6934     ** root page of the new table should go. meta[3] is the largest root-page
6935     ** created so far, so the new root-page is (meta[3]+1).
6936     */
6937     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
6938     pgnoRoot++;
6939 
6940     /* The new root-page may not be allocated on a pointer-map page, or the
6941     ** PENDING_BYTE page.
6942     */
6943     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6944         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6945       pgnoRoot++;
6946     }
6947     assert( pgnoRoot>=3 );
6948 
6949     /* Allocate a page. The page that currently resides at pgnoRoot will
6950     ** be moved to the allocated page (unless the allocated page happens
6951     ** to reside at pgnoRoot).
6952     */
6953     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6954     if( rc!=SQLITE_OK ){
6955       return rc;
6956     }
6957 
6958     if( pgnoMove!=pgnoRoot ){
6959       /* pgnoRoot is the page that will be used for the root-page of
6960       ** the new table (assuming an error did not occur). But we were
6961       ** allocated pgnoMove. If required (i.e. if it was not allocated
6962       ** by extending the file), the current page at position pgnoMove
6963       ** is already journaled.
6964       */
6965       u8 eType = 0;
6966       Pgno iPtrPage = 0;
6967 
6968       releasePage(pPageMove);
6969 
6970       /* Move the page currently at pgnoRoot to pgnoMove. */
6971       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6972       if( rc!=SQLITE_OK ){
6973         return rc;
6974       }
6975       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6976       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6977         rc = SQLITE_CORRUPT_BKPT;
6978       }
6979       if( rc!=SQLITE_OK ){
6980         releasePage(pRoot);
6981         return rc;
6982       }
6983       assert( eType!=PTRMAP_ROOTPAGE );
6984       assert( eType!=PTRMAP_FREEPAGE );
6985       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6986       releasePage(pRoot);
6987 
6988       /* Obtain the page at pgnoRoot */
6989       if( rc!=SQLITE_OK ){
6990         return rc;
6991       }
6992       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6993       if( rc!=SQLITE_OK ){
6994         return rc;
6995       }
6996       rc = sqlite3PagerWrite(pRoot->pDbPage);
6997       if( rc!=SQLITE_OK ){
6998         releasePage(pRoot);
6999         return rc;
7000       }
7001     }else{
7002       pRoot = pPageMove;
7003     }
7004 
7005     /* Update the pointer-map and meta-data with the new root-page number. */
7006     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
7007     if( rc ){
7008       releasePage(pRoot);
7009       return rc;
7010     }
7011 
7012     /* When the new root page was allocated, page 1 was made writable in
7013     ** order either to increase the database filesize, or to decrement the
7014     ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
7015     */
7016     assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
7017     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
7018     if( NEVER(rc) ){
7019       releasePage(pRoot);
7020       return rc;
7021     }
7022 
7023   }else{
7024     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7025     if( rc ) return rc;
7026   }
7027 #endif
7028   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7029   if( createTabFlags & BTREE_INTKEY ){
7030     ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
7031   }else{
7032     ptfFlags = PTF_ZERODATA | PTF_LEAF;
7033   }
7034   zeroPage(pRoot, ptfFlags);
7035   sqlite3PagerUnref(pRoot->pDbPage);
7036   assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
7037   *piTable = (int)pgnoRoot;
7038   return SQLITE_OK;
7039 }
/*
** Create a new BTree table on behalf of an external caller.  The work
** is done by btreeCreateTable(); this wrapper brackets that call with
** sqlite3BtreeEnter() and sqlite3BtreeLeave() and passes its result
** code straight back.
*/
int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
  int rcCreate;
  sqlite3BtreeEnter(p);
  rcCreate = btreeCreateTable(p, piTable, flags);
  sqlite3BtreeLeave(p);
  return rcCreate;
}
7047 
7048 /*
7049 ** Erase the given database page and all its children.  Return
7050 ** the page to the freelist.
7051 */
clearDatabasePage(BtShared * pBt,Pgno pgno,int freePageFlag,int * pnChange)7052 static int clearDatabasePage(
7053   BtShared *pBt,           /* The BTree that contains the table */
7054   Pgno pgno,               /* Page number to clear */
7055   int freePageFlag,        /* Deallocate page if true */
7056   int *pnChange            /* Add number of Cells freed to this counter */
7057 ){
7058   MemPage *pPage;
7059   int rc;
7060   unsigned char *pCell;
7061   int i;
7062 
7063   assert( sqlite3_mutex_held(pBt->mutex) );
7064   if( pgno>btreePagecount(pBt) ){
7065     return SQLITE_CORRUPT_BKPT;
7066   }
7067 
7068   rc = getAndInitPage(pBt, pgno, &pPage);
7069   if( rc ) return rc;
7070   for(i=0; i<pPage->nCell; i++){
7071     pCell = findCell(pPage, i);
7072     if( !pPage->leaf ){
7073       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
7074       if( rc ) goto cleardatabasepage_out;
7075     }
7076     rc = clearCell(pPage, pCell);
7077     if( rc ) goto cleardatabasepage_out;
7078   }
7079   if( !pPage->leaf ){
7080     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
7081     if( rc ) goto cleardatabasepage_out;
7082   }else if( pnChange ){
7083     assert( pPage->intKey );
7084     *pnChange += pPage->nCell;
7085   }
7086   if( freePageFlag ){
7087     freePage(pPage, &rc);
7088   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
7089     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
7090   }
7091 
7092 cleardatabasepage_out:
7093   releasePage(pPage);
7094   return rc;
7095 }
7096 
7097 /*
7098 ** Delete all information from a single table in the database.  iTable is
7099 ** the page number of the root of the table.  After this routine returns,
7100 ** the root page is empty, but still exists.
7101 **
7102 ** This routine will fail with SQLITE_LOCKED if there are any open
7103 ** read cursors on the table.  Open write cursors are moved to the
7104 ** root of the table.
7105 **
7106 ** If pnChange is not NULL, then table iTable must be an intkey table. The
7107 ** integer value pointed to by pnChange is incremented by the number of
7108 ** entries in the table.
7109 */
sqlite3BtreeClearTable(Btree * p,int iTable,int * pnChange)7110 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
7111   int rc;
7112   BtShared *pBt = p->pBt;
7113   sqlite3BtreeEnter(p);
7114   assert( p->inTrans==TRANS_WRITE );
7115 
7116   /* Invalidate all incrblob cursors open on table iTable (assuming iTable
7117   ** is the root of a table b-tree - if it is not, the following call is
7118   ** a no-op).  */
7119   invalidateIncrblobCursors(p, 0, 1);
7120 
7121   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
7122   if( SQLITE_OK==rc ){
7123     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
7124   }
7125   sqlite3BtreeLeave(p);
7126   return rc;
7127 }
7128 
7129 /*
7130 ** Erase all information in a table and add the root of the table to
7131 ** the freelist.  Except, the root of the principal table (the one on
7132 ** page 1) is never added to the freelist.
7133 **
7134 ** This routine will fail with SQLITE_LOCKED if there are any open
7135 ** cursors on the table.
7136 **
7137 ** If AUTOVACUUM is enabled and the page at iTable is not the last
7138 ** root page in the database file, then the last root page
7139 ** in the database file is moved into the slot formerly occupied by
7140 ** iTable and that last slot formerly occupied by the last root page
7141 ** is added to the freelist instead of iTable.  In this way, all
7142 ** root pages are kept at the beginning of the database file, which
7143 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
7144 ** page number that used to be the last root page in the file before
7145 ** the move.  If no page gets moved, *piMoved is set to 0.
7146 ** The last root page is recorded in meta[3] and the value of
7147 ** meta[3] is updated by this procedure.
7148 */
static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
  int rc;                     /* Result code of the most recent operation */
  MemPage *pPage = 0;         /* Root page of the table being dropped */
  BtShared *pBt = p->pBt;     /* Shared b-tree that handle p is attached to */

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( p->inTrans==TRANS_WRITE );

  /* It is illegal to drop a table if any cursors are open on the
  ** database. This is because in auto-vacuum mode the backend may
  ** need to move another root-page to fill a gap left by the deleted
  ** root page. If an open cursor was using this page a problem would
  ** occur.
  **
  ** This error is caught long before control reaches this point.
  */
  if( NEVER(pBt->pCursor) ){
    sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
    return SQLITE_LOCKED_SHAREDCACHE;
  }

  /* Take a reference to the root page, then clear the table content.
  ** The reference obtained here must be released on every path below.
  */
  rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
  if( rc ) return rc;
  rc = sqlite3BtreeClearTable(p, iTable, 0);
  if( rc ){
    releasePage(pPage);
    return rc;
  }

  /* Note that *piMoved is only written once the two calls above have
  ** succeeded; on the early error returns it is left untouched.
  */
  *piMoved = 0;

  if( iTable>1 ){
#ifdef SQLITE_OMIT_AUTOVACUUM
    freePage(pPage, &rc);
    releasePage(pPage);
#else
    if( pBt->autoVacuum ){
      Pgno maxRootPgno;       /* Largest root page number, from the header */
      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);

      if( iTable==maxRootPgno ){
        /* If the table being dropped is the table with the largest root-page
        ** number in the database, put the root page on the free list.
        */
        freePage(pPage, &rc);
        releasePage(pPage);
        if( rc!=SQLITE_OK ){
          return rc;
        }
      }else{
        /* The table being dropped does not have the largest root-page
        ** number in the database. So move the page that does into the
        ** gap left by the deleted root-page.
        */
        MemPage *pMove;
        releasePage(pPage);
        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
        releasePage(pMove);
        if( rc!=SQLITE_OK ){
          return rc;
        }

        /* Fetch page maxRootPgno again and put it on the free list.
        ** pMove is cleared first so that the releasePage() call below
        ** is handed a null pointer if btreeGetPage() fails.  rc is
        ** passed through freePage() and tested afterwards.
        */
        pMove = 0;
        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
        freePage(pMove, &rc);
        releasePage(pMove);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        *piMoved = maxRootPgno;
      }

      /* Set the new 'max-root-page' value in the database header. This
      ** is the old value less one, less one more if that happens to
      ** be a root-page number, less one again if that is the
      ** PENDING_BYTE_PAGE.
      */
      maxRootPgno--;
      while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
             || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
        maxRootPgno--;
      }
      assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );

      /* Write back to the same header slot that was read above, using
      ** the same symbolic index rather than a bare literal.
      */
      rc = sqlite3BtreeUpdateMeta(p, BTREE_LARGEST_ROOT_PAGE, maxRootPgno);
    }else{
      freePage(pPage, &rc);
      releasePage(pPage);
    }
#endif
  }else{
    /* If sqlite3BtreeDropTable was called on page 1.
    ** This really never should happen except in a corrupt
    ** database.
    */
    zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
    releasePage(pPage);
  }
  return rc;
}
/*
** Public entry point for dropping a table.  Calls sqlite3BtreeEnter()
** on p, hands all three arguments unchanged to btreeDropTable(), calls
** sqlite3BtreeLeave() and returns the result code from btreeDropTable().
*/
int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  int result;                 /* Result code from btreeDropTable() */

  sqlite3BtreeEnter(p);
  result = btreeDropTable(p, iTable, piMoved);
  sqlite3BtreeLeave(p);

  return result;
}
7259 
7260 
7261 /*
7262 ** This function may only be called if the b-tree connection already
7263 ** has a read or write transaction open on the database.
7264 **
7265 ** Read the meta-information out of a database file.  Meta[0]
7266 ** is the number of free pages currently in the database.  Meta[1]
7267 ** through meta[15] are available for use by higher layers.  Meta[0]
7268 ** is read-only, the others are read/write.
7269 **
7270 ** The schema layer numbers meta values differently.  At the schema
7271 ** layer (and the SetCookie and ReadCookie opcodes) the number of
7272 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
7273 */
sqlite3BtreeGetMeta(Btree * p,int idx,u32 * pMeta)7274 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
7275   BtShared *pBt = p->pBt;
7276 
7277   sqlite3BtreeEnter(p);
7278   assert( p->inTrans>TRANS_NONE );
7279   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
7280   assert( pBt->pPage1 );
7281   assert( idx>=0 && idx<=15 );
7282 
7283   *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
7284 
7285   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7286   ** database, mark the database as read-only.  */
7287 #ifdef SQLITE_OMIT_AUTOVACUUM
7288   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1;
7289 #endif
7290 
7291   sqlite3BtreeLeave(p);
7292 }
7293 
7294 /*
7295 ** Write meta-information back into the database.  Meta[0] is
7296 ** read-only and may not be written.
7297 */
sqlite3BtreeUpdateMeta(Btree * p,int idx,u32 iMeta)7298 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7299   BtShared *pBt = p->pBt;
7300   unsigned char *pP1;
7301   int rc;
7302   assert( idx>=1 && idx<=15 );
7303   sqlite3BtreeEnter(p);
7304   assert( p->inTrans==TRANS_WRITE );
7305   assert( pBt->pPage1!=0 );
7306   pP1 = pBt->pPage1->aData;
7307   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7308   if( rc==SQLITE_OK ){
7309     put4byte(&pP1[36 + idx*4], iMeta);
7310 #ifndef SQLITE_OMIT_AUTOVACUUM
7311     if( idx==BTREE_INCR_VACUUM ){
7312       assert( pBt->autoVacuum || iMeta==0 );
7313       assert( iMeta==0 || iMeta==1 );
7314       pBt->incrVacuum = (u8)iMeta;
7315     }
7316 #endif
7317   }
7318   sqlite3BtreeLeave(p);
7319   return rc;
7320 }
7321 
7322 #ifndef SQLITE_OMIT_BTREECOUNT
7323 /*
7324 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
7325 ** number of entries in the b-tree and write the result to *pnEntry.
7326 **
7327 ** SQLITE_OK is returned if the operation is successfully executed.
7328 ** Otherwise, if an error is encountered (i.e. an IO error or database
7329 ** corruption) an SQLite error code is returned.
7330 */
sqlite3BtreeCount(BtCursor * pCur,i64 * pnEntry)7331 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7332   i64 nEntry = 0;                      /* Value to return in *pnEntry */
7333   int rc;                              /* Return code */
7334   rc = moveToRoot(pCur);
7335 
7336   /* Unless an error occurs, the following loop runs one iteration for each
7337   ** page in the B-Tree structure (not including overflow pages).
7338   */
7339   while( rc==SQLITE_OK ){
7340     int iIdx;                          /* Index of child node in parent */
7341     MemPage *pPage;                    /* Current page of the b-tree */
7342 
7343     /* If this is a leaf page or the tree is not an int-key tree, then
7344     ** this page contains countable entries. Increment the entry counter
7345     ** accordingly.
7346     */
7347     pPage = pCur->apPage[pCur->iPage];
7348     if( pPage->leaf || !pPage->intKey ){
7349       nEntry += pPage->nCell;
7350     }
7351 
7352     /* pPage is a leaf node. This loop navigates the cursor so that it
7353     ** points to the first interior cell that it points to the parent of
7354     ** the next page in the tree that has not yet been visited. The
7355     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7356     ** of the page, or to the number of cells in the page if the next page
7357     ** to visit is the right-child of its parent.
7358     **
7359     ** If all pages in the tree have been visited, return SQLITE_OK to the
7360     ** caller.
7361     */
7362     if( pPage->leaf ){
7363       do {
7364         if( pCur->iPage==0 ){
7365           /* All pages of the b-tree have been visited. Return successfully. */
7366           *pnEntry = nEntry;
7367           return SQLITE_OK;
7368         }
7369         moveToParent(pCur);
7370       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7371 
7372       pCur->aiIdx[pCur->iPage]++;
7373       pPage = pCur->apPage[pCur->iPage];
7374     }
7375 
7376     /* Descend to the child node of the cell that the cursor currently
7377     ** points at. This is the right-child if (iIdx==pPage->nCell).
7378     */
7379     iIdx = pCur->aiIdx[pCur->iPage];
7380     if( iIdx==pPage->nCell ){
7381       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7382     }else{
7383       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7384     }
7385   }
7386 
7387   /* An error has occurred. Return an error code. */
7388   return rc;
7389 }
7390 #endif
7391 
7392 /*
7393 ** Return the pager associated with a BTree.  This routine is used for
7394 ** testing and debugging only.
7395 */
sqlite3BtreePager(Btree * p)7396 Pager *sqlite3BtreePager(Btree *p){
7397   return p->pBt->pPager;
7398 }
7399 
7400 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7401 /*
7402 ** Append a message to the error message string.
7403 */
checkAppendMsg(IntegrityCk * pCheck,char * zMsg1,const char * zFormat,...)7404 static void checkAppendMsg(
7405   IntegrityCk *pCheck,
7406   char *zMsg1,
7407   const char *zFormat,
7408   ...
7409 ){
7410   va_list ap;
7411   if( !pCheck->mxErr ) return;
7412   pCheck->mxErr--;
7413   pCheck->nErr++;
7414   va_start(ap, zFormat);
7415   if( pCheck->errMsg.nChar ){
7416     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
7417   }
7418   if( zMsg1 ){
7419     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
7420   }
7421   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7422   va_end(ap);
7423   if( pCheck->errMsg.mallocFailed ){
7424     pCheck->mallocFailed = 1;
7425   }
7426 }
7427 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7428 
7429 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7430 /*
7431 ** Add 1 to the reference count for page iPage.  If this is the second
7432 ** reference to the page, add an error message to pCheck->zErrMsg.
7433 ** Return 1 if there are 2 ore more references to the page and 0 if
7434 ** if this is the first reference to the page.
7435 **
7436 ** Also check that the page number is in bounds.
7437 */
checkRef(IntegrityCk * pCheck,Pgno iPage,char * zContext)7438 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7439   if( iPage==0 ) return 1;
7440   if( iPage>pCheck->nPage ){
7441     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7442     return 1;
7443   }
7444   if( pCheck->anRef[iPage]==1 ){
7445     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7446     return 1;
7447   }
7448   return  (pCheck->anRef[iPage]++)>1;
7449 }
7450 
7451 #ifndef SQLITE_OMIT_AUTOVACUUM
7452 /*
7453 ** Check that the entry in the pointer-map for page iChild maps to
7454 ** page iParent, pointer type ptrType. If not, append an error message
7455 ** to pCheck.
7456 */
checkPtrmap(IntegrityCk * pCheck,Pgno iChild,u8 eType,Pgno iParent,char * zContext)7457 static void checkPtrmap(
7458   IntegrityCk *pCheck,   /* Integrity check context */
7459   Pgno iChild,           /* Child page number */
7460   u8 eType,              /* Expected pointer map type */
7461   Pgno iParent,          /* Expected pointer map parent page number */
7462   char *zContext         /* Context description (used for error msg) */
7463 ){
7464   int rc;
7465   u8 ePtrmapType;
7466   Pgno iPtrmapParent;
7467 
7468   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7469   if( rc!=SQLITE_OK ){
7470     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
7471     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7472     return;
7473   }
7474 
7475   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7476     checkAppendMsg(pCheck, zContext,
7477       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7478       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7479   }
7480 }
7481 #endif
7482 
7483 /*
7484 ** Check the integrity of the freelist or of an overflow page list.
7485 ** Verify that the number of pages on the list is N.
7486 */
checkList(IntegrityCk * pCheck,int isFreeList,int iPage,int N,char * zContext)7487 static void checkList(
7488   IntegrityCk *pCheck,  /* Integrity checking context */
7489   int isFreeList,       /* True for a freelist.  False for overflow page list */
7490   int iPage,            /* Page number for first page in the list */
7491   int N,                /* Expected number of pages in the list */
7492   char *zContext        /* Context for error messages */
7493 ){
7494   int i;
7495   int expected = N;
7496   int iFirst = iPage;
7497   while( N-- > 0 && pCheck->mxErr ){
7498     DbPage *pOvflPage;
7499     unsigned char *pOvflData;
7500     if( iPage<1 ){
7501       checkAppendMsg(pCheck, zContext,
7502          "%d of %d pages missing from overflow list starting at %d",
7503           N+1, expected, iFirst);
7504       break;
7505     }
7506     if( checkRef(pCheck, iPage, zContext) ) break;
7507     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7508       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7509       break;
7510     }
7511     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7512     if( isFreeList ){
7513       int n = get4byte(&pOvflData[4]);
7514 #ifndef SQLITE_OMIT_AUTOVACUUM
7515       if( pCheck->pBt->autoVacuum ){
7516         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7517       }
7518 #endif
7519       if( n>(int)pCheck->pBt->usableSize/4-2 ){
7520         checkAppendMsg(pCheck, zContext,
7521            "freelist leaf count too big on page %d", iPage);
7522         N--;
7523       }else{
7524         for(i=0; i<n; i++){
7525           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7526 #ifndef SQLITE_OMIT_AUTOVACUUM
7527           if( pCheck->pBt->autoVacuum ){
7528             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7529           }
7530 #endif
7531           checkRef(pCheck, iFreePage, zContext);
7532         }
7533         N -= n;
7534       }
7535     }
7536 #ifndef SQLITE_OMIT_AUTOVACUUM
7537     else{
7538       /* If this database supports auto-vacuum and iPage is not the last
7539       ** page in this overflow list, check that the pointer-map entry for
7540       ** the following page matches iPage.
7541       */
7542       if( pCheck->pBt->autoVacuum && N>0 ){
7543         i = get4byte(pOvflData);
7544         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7545       }
7546     }
7547 #endif
7548     iPage = get4byte(pOvflData);
7549     sqlite3PagerUnref(pOvflPage);
7550   }
7551 }
7552 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7553 
7554 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7555 /*
7556 ** Do various sanity checks on a single page of a tree.  Return
7557 ** the tree depth.  Root pages return 0.  Parents of root pages
7558 ** return 1, and so forth.
7559 **
7560 ** These checks are done:
7561 **
7562 **      1.  Make sure that cells and freeblocks do not overlap
7563 **          but combine to completely cover the page.
7564 **  NO  2.  Make sure cell keys are in order.
7565 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
7566 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
7567 **      5.  Check the integrity of overflow pages.
7568 **      6.  Recursively call checkTreePage on all children.
7569 **      7.  Verify that the depth of all children is the same.
7570 **      8.  Make sure this page is at least 33% full or else it is
7571 **          the root of the tree.
7572 */
checkTreePage(IntegrityCk * pCheck,int iPage,char * zParentContext,i64 * pnParentMinKey,i64 * pnParentMaxKey)7573 static int checkTreePage(
7574   IntegrityCk *pCheck,  /* Context for the sanity check */
7575   int iPage,            /* Page number of the page to check */
7576   char *zParentContext, /* Parent context */
7577   i64 *pnParentMinKey,
7578   i64 *pnParentMaxKey
7579 ){
7580   MemPage *pPage;
7581   int i, rc, depth, d2, pgno, cnt;
7582   int hdr, cellStart;
7583   int nCell;
7584   u8 *data;
7585   BtShared *pBt;
7586   int usableSize;
7587   char zContext[100];
7588   char *hit = 0;
7589   i64 nMinKey = 0;
7590   i64 nMaxKey = 0;
7591 
7592   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
7593 
7594   /* Check that the page exists
7595   */
7596   pBt = pCheck->pBt;
7597   usableSize = pBt->usableSize;
7598   if( iPage==0 ) return 0;
7599   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
7600   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
7601     checkAppendMsg(pCheck, zContext,
7602        "unable to get the page. error code=%d", rc);
7603     return 0;
7604   }
7605 
7606   /* Clear MemPage.isInit to make sure the corruption detection code in
7607   ** btreeInitPage() is executed.  */
7608   pPage->isInit = 0;
7609   if( (rc = btreeInitPage(pPage))!=0 ){
7610     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
7611     checkAppendMsg(pCheck, zContext,
7612                    "btreeInitPage() returns error code %d", rc);
7613     releasePage(pPage);
7614     return 0;
7615   }
7616 
7617   /* Check out all the cells.
7618   */
7619   depth = 0;
7620   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
7621     u8 *pCell;
7622     u32 sz;
7623     CellInfo info;
7624 
7625     /* Check payload overflow pages
7626     */
7627     sqlite3_snprintf(sizeof(zContext), zContext,
7628              "On tree page %d cell %d: ", iPage, i);
7629     pCell = findCell(pPage,i);
7630     btreeParseCellPtr(pPage, pCell, &info);
7631     sz = info.nData;
7632     if( !pPage->intKey ) sz += (int)info.nKey;
7633     /* For intKey pages, check that the keys are in order.
7634     */
7635     else if( i==0 ) nMinKey = nMaxKey = info.nKey;
7636     else{
7637       if( info.nKey <= nMaxKey ){
7638         checkAppendMsg(pCheck, zContext,
7639             "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
7640       }
7641       nMaxKey = info.nKey;
7642     }
7643     assert( sz==info.nPayload );
7644     if( (sz>info.nLocal)
7645      && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
7646     ){
7647       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
7648       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
7649 #ifndef SQLITE_OMIT_AUTOVACUUM
7650       if( pBt->autoVacuum ){
7651         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
7652       }
7653 #endif
7654       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
7655     }
7656 
7657     /* Check sanity of left child page.
7658     */
7659     if( !pPage->leaf ){
7660       pgno = get4byte(pCell);
7661 #ifndef SQLITE_OMIT_AUTOVACUUM
7662       if( pBt->autoVacuum ){
7663         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7664       }
7665 #endif
7666       d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
7667       if( i>0 && d2!=depth ){
7668         checkAppendMsg(pCheck, zContext, "Child page depth differs");
7669       }
7670       depth = d2;
7671     }
7672   }
7673 
7674   if( !pPage->leaf ){
7675     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7676     sqlite3_snprintf(sizeof(zContext), zContext,
7677                      "On page %d at right child: ", iPage);
7678 #ifndef SQLITE_OMIT_AUTOVACUUM
7679     if( pBt->autoVacuum ){
7680       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7681     }
7682 #endif
7683     checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
7684   }
7685 
7686   /* For intKey leaf pages, check that the min/max keys are in order
7687   ** with any left/parent/right pages.
7688   */
7689   if( pPage->leaf && pPage->intKey ){
7690     /* if we are a left child page */
7691     if( pnParentMinKey ){
7692       /* if we are the left most child page */
7693       if( !pnParentMaxKey ){
7694         if( nMaxKey > *pnParentMinKey ){
7695           checkAppendMsg(pCheck, zContext,
7696               "Rowid %lld out of order (max larger than parent min of %lld)",
7697               nMaxKey, *pnParentMinKey);
7698         }
7699       }else{
7700         if( nMinKey <= *pnParentMinKey ){
7701           checkAppendMsg(pCheck, zContext,
7702               "Rowid %lld out of order (min less than parent min of %lld)",
7703               nMinKey, *pnParentMinKey);
7704         }
7705         if( nMaxKey > *pnParentMaxKey ){
7706           checkAppendMsg(pCheck, zContext,
7707               "Rowid %lld out of order (max larger than parent max of %lld)",
7708               nMaxKey, *pnParentMaxKey);
7709         }
7710         *pnParentMinKey = nMaxKey;
7711       }
7712     /* else if we're a right child page */
7713     } else if( pnParentMaxKey ){
7714       if( nMinKey <= *pnParentMaxKey ){
7715         checkAppendMsg(pCheck, zContext,
7716             "Rowid %lld out of order (min less than parent max of %lld)",
7717             nMinKey, *pnParentMaxKey);
7718       }
7719     }
7720   }
7721 
7722   /* Check for complete coverage of the page
7723   */
7724   data = pPage->aData;
7725   hdr = pPage->hdrOffset;
7726   hit = sqlite3PageMalloc( pBt->pageSize );
7727   if( hit==0 ){
7728     pCheck->mallocFailed = 1;
7729   }else{
7730     int contentOffset = get2byteNotZero(&data[hdr+5]);
7731     assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
7732     memset(hit+contentOffset, 0, usableSize-contentOffset);
7733     memset(hit, 1, contentOffset);
7734     nCell = get2byte(&data[hdr+3]);
7735     cellStart = hdr + 12 - 4*pPage->leaf;
7736     for(i=0; i<nCell; i++){
7737       int pc = get2byte(&data[cellStart+i*2]);
7738       u32 size = 65536;
7739       int j;
7740       if( pc<=usableSize-4 ){
7741         size = cellSizePtr(pPage, &data[pc]);
7742       }
7743       if( (int)(pc+size-1)>=usableSize ){
7744         checkAppendMsg(pCheck, 0,
7745             "Corruption detected in cell %d on page %d",i,iPage);
7746       }else{
7747         for(j=pc+size-1; j>=pc; j--) hit[j]++;
7748       }
7749     }
7750     i = get2byte(&data[hdr+1]);
7751     while( i>0 ){
7752       int size, j;
7753       assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */
7754       size = get2byte(&data[i+2]);
7755       assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */
7756       for(j=i+size-1; j>=i; j--) hit[j]++;
7757       j = get2byte(&data[i]);
7758       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
7759       assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */
7760       i = j;
7761     }
7762     for(i=cnt=0; i<usableSize; i++){
7763       if( hit[i]==0 ){
7764         cnt++;
7765       }else if( hit[i]>1 ){
7766         checkAppendMsg(pCheck, 0,
7767           "Multiple uses for byte %d of page %d", i, iPage);
7768         break;
7769       }
7770     }
7771     if( cnt!=data[hdr+7] ){
7772       checkAppendMsg(pCheck, 0,
7773           "Fragmentation of %d bytes reported as %d on page %d",
7774           cnt, data[hdr+7], iPage);
7775     }
7776   }
7777   sqlite3PageFree(hit);
7778   releasePage(pPage);
7779   return depth+1;
7780 }
7781 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7782 
7783 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7784 /*
7785 ** This routine does a complete check of the given BTree file.  aRoot[] is
7786 ** an array of pages numbers were each page number is the root page of
7787 ** a table.  nRoot is the number of entries in aRoot.
7788 **
7789 ** A read-only or read-write transaction must be opened before calling
7790 ** this function.
7791 **
7792 ** Write the number of error seen in *pnErr.  Except for some memory
7793 ** allocation errors,  an error message held in memory obtained from
7794 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
7795 ** returned.  If a memory allocation error occurs, NULL is returned.
7796 */
sqlite3BtreeIntegrityCheck(Btree * p,int * aRoot,int nRoot,int mxErr,int * pnErr)7797 char *sqlite3BtreeIntegrityCheck(
7798   Btree *p,     /* The btree to be checked */
7799   int *aRoot,   /* An array of root pages numbers for individual trees */
7800   int nRoot,    /* Number of entries in aRoot[] */
7801   int mxErr,    /* Stop reporting errors after this many */
7802   int *pnErr    /* Write number of errors seen to this variable */
7803 ){
7804   Pgno i;
7805   int nRef;
7806   IntegrityCk sCheck;
7807   BtShared *pBt = p->pBt;
7808   char zErr[100];
7809 
7810   sqlite3BtreeEnter(p);
7811   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
7812   nRef = sqlite3PagerRefcount(pBt->pPager);
7813   sCheck.pBt = pBt;
7814   sCheck.pPager = pBt->pPager;
7815   sCheck.nPage = btreePagecount(sCheck.pBt);
7816   sCheck.mxErr = mxErr;
7817   sCheck.nErr = 0;
7818   sCheck.mallocFailed = 0;
7819   *pnErr = 0;
7820   if( sCheck.nPage==0 ){
7821     sqlite3BtreeLeave(p);
7822     return 0;
7823   }
7824   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
7825   if( !sCheck.anRef ){
7826     *pnErr = 1;
7827     sqlite3BtreeLeave(p);
7828     return 0;
7829   }
7830   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
7831   i = PENDING_BYTE_PAGE(pBt);
7832   if( i<=sCheck.nPage ){
7833     sCheck.anRef[i] = 1;
7834   }
7835   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
7836   sCheck.errMsg.useMalloc = 2;
7837 
7838   /* Check the integrity of the freelist
7839   */
7840   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7841             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
7842 
7843   /* Check all the tables.
7844   */
7845   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
7846     if( aRoot[i]==0 ) continue;
7847 #ifndef SQLITE_OMIT_AUTOVACUUM
7848     if( pBt->autoVacuum && aRoot[i]>1 ){
7849       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
7850     }
7851 #endif
7852     checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
7853   }
7854 
7855   /* Make sure every page in the file is referenced
7856   */
7857   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
7858 #ifdef SQLITE_OMIT_AUTOVACUUM
7859     if( sCheck.anRef[i]==0 ){
7860       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7861     }
7862 #else
7863     /* If the database supports auto-vacuum, make sure no tables contain
7864     ** references to pointer-map pages.
7865     */
7866     if( sCheck.anRef[i]==0 &&
7867        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
7868       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7869     }
7870     if( sCheck.anRef[i]!=0 &&
7871        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
7872       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
7873     }
7874 #endif
7875   }
7876 
7877   /* Make sure this analysis did not leave any unref() pages.
7878   ** This is an internal consistency check; an integrity check
7879   ** of the integrity check.
7880   */
7881   if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
7882     checkAppendMsg(&sCheck, 0,
7883       "Outstanding page count goes from %d to %d during this analysis",
7884       nRef, sqlite3PagerRefcount(pBt->pPager)
7885     );
7886   }
7887 
7888   /* Clean  up and report errors.
7889   */
7890   sqlite3BtreeLeave(p);
7891   sqlite3_free(sCheck.anRef);
7892   if( sCheck.mallocFailed ){
7893     sqlite3StrAccumReset(&sCheck.errMsg);
7894     *pnErr = sCheck.nErr+1;
7895     return 0;
7896   }
7897   *pnErr = sCheck.nErr;
7898   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7899   return sqlite3StrAccumFinish(&sCheck.errMsg);
7900 }
7901 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7902 
7903 /*
7904 ** Return the full pathname of the underlying database file.
7905 **
7906 ** The pager filename is invariant as long as the pager is
7907 ** open so it is safe to access without the BtShared mutex.
7908 */
sqlite3BtreeGetFilename(Btree * p)7909 const char *sqlite3BtreeGetFilename(Btree *p){
7910   assert( p->pBt->pPager!=0 );
7911   return sqlite3PagerFilename(p->pBt->pPager);
7912 }
7913 
7914 /*
7915 ** Return the pathname of the journal file for this database. The return
7916 ** value of this routine is the same regardless of whether the journal file
7917 ** has been created or not.
7918 **
7919 ** The pager journal filename is invariant as long as the pager is
7920 ** open so it is safe to access without the BtShared mutex.
7921 */
sqlite3BtreeGetJournalname(Btree * p)7922 const char *sqlite3BtreeGetJournalname(Btree *p){
7923   assert( p->pBt->pPager!=0 );
7924   return sqlite3PagerJournalname(p->pBt->pPager);
7925 }
7926 
7927 /*
7928 ** Return non-zero if a transaction is active.
7929 */
sqlite3BtreeIsInTrans(Btree * p)7930 int sqlite3BtreeIsInTrans(Btree *p){
7931   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7932   return (p && (p->inTrans==TRANS_WRITE));
7933 }
7934 
7935 #ifndef SQLITE_OMIT_WAL
7936 /*
7937 ** Run a checkpoint on the Btree passed as the first argument.
7938 **
7939 ** Return SQLITE_LOCKED if this or any other connection has an open
7940 ** transaction on the shared-cache the argument Btree is connected to.
7941 **
7942 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
7943 */
sqlite3BtreeCheckpoint(Btree * p,int eMode,int * pnLog,int * pnCkpt)7944 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
7945   int rc = SQLITE_OK;
7946   if( p ){
7947     BtShared *pBt = p->pBt;
7948     sqlite3BtreeEnter(p);
7949     if( pBt->inTransaction!=TRANS_NONE ){
7950       rc = SQLITE_LOCKED;
7951     }else{
7952       rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
7953     }
7954     sqlite3BtreeLeave(p);
7955   }
7956   return rc;
7957 }
7958 #endif
7959 
7960 /*
7961 ** Return non-zero if a read (or write) transaction is active.
7962 */
sqlite3BtreeIsInReadTrans(Btree * p)7963 int sqlite3BtreeIsInReadTrans(Btree *p){
7964   assert( p );
7965   assert( sqlite3_mutex_held(p->db->mutex) );
7966   return p->inTrans!=TRANS_NONE;
7967 }
7968 
/*
** Return 1 if the nBackup counter of handle p is non-zero and 0 if it
** is zero.  p must not be NULL.
*/
int sqlite3BtreeIsInBackup(Btree *p){
  assert( p );
  assert( sqlite3_mutex_held(p->db->mutex) );
  if( p->nBackup==0 ){
    return 0;
  }
  return 1;
}
7974 
7975 /*
7976 ** This function returns a pointer to a blob of memory associated with
7977 ** a single shared-btree. The memory is used by client code for its own
7978 ** purposes (for example, to store a high-level schema associated with
7979 ** the shared-btree). The btree layer manages reference counting issues.
7980 **
7981 ** The first time this is called on a shared-btree, nBytes bytes of memory
7982 ** are allocated, zeroed, and returned to the caller. For each subsequent
7983 ** call the nBytes parameter is ignored and a pointer to the same blob
7984 ** of memory returned.
7985 **
7986 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7987 ** allocated, a null pointer is returned. If the blob has already been
7988 ** allocated, it is returned as normal.
7989 **
7990 ** Just before the shared-btree is closed, the function passed as the
7991 ** xFree argument when the memory allocation was made is invoked on the
7992 ** blob of allocated memory. The xFree function should not call sqlite3_free()
7993 ** on the memory, the btree layer does that.
7994 */
sqlite3BtreeSchema(Btree * p,int nBytes,void (* xFree)(void *))7995 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7996   BtShared *pBt = p->pBt;
7997   sqlite3BtreeEnter(p);
7998   if( !pBt->pSchema && nBytes ){
7999     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
8000     pBt->xFreeSchema = xFree;
8001   }
8002   sqlite3BtreeLeave(p);
8003   return pBt->pSchema;
8004 }
8005 
8006 /*
8007 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
8008 ** btree as the argument handle holds an exclusive lock on the
8009 ** sqlite_master table. Otherwise SQLITE_OK.
8010 */
sqlite3BtreeSchemaLocked(Btree * p)8011 int sqlite3BtreeSchemaLocked(Btree *p){
8012   int rc;
8013   assert( sqlite3_mutex_held(p->db->mutex) );
8014   sqlite3BtreeEnter(p);
8015   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
8016   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
8017   sqlite3BtreeLeave(p);
8018   return rc;
8019 }
8020 
8021 
8022 #ifndef SQLITE_OMIT_SHARED_CACHE
8023 /*
8024 ** Obtain a lock on the table whose root page is iTab.  The
8025 ** lock is a write lock if isWritelock is true or a read lock
8026 ** if it is false.
8027 */
sqlite3BtreeLockTable(Btree * p,int iTab,u8 isWriteLock)8028 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
8029   int rc = SQLITE_OK;
8030   assert( p->inTrans!=TRANS_NONE );
8031   if( p->sharable ){
8032     u8 lockType = READ_LOCK + isWriteLock;
8033     assert( READ_LOCK+1==WRITE_LOCK );
8034     assert( isWriteLock==0 || isWriteLock==1 );
8035 
8036     sqlite3BtreeEnter(p);
8037     rc = querySharedCacheTableLock(p, iTab, lockType);
8038     if( rc==SQLITE_OK ){
8039       rc = setSharedCacheTableLock(p, iTab, lockType);
8040     }
8041     sqlite3BtreeLeave(p);
8042   }
8043   return rc;
8044 }
8045 #endif
8046 
8047 #ifndef SQLITE_OMIT_INCRBLOB
8048 /*
8049 ** Argument pCsr must be a cursor opened for writing on an
8050 ** INTKEY table currently pointing at a valid table entry.
8051 ** This function modifies the data stored as part of that entry.
8052 **
8053 ** Only the data content may only be modified, it is not possible to
8054 ** change the length of the data stored. If this function is called with
8055 ** parameters that attempt to write past the end of the existing data,
8056 ** no modifications are made and SQLITE_CORRUPT is returned.
8057 */
sqlite3BtreePutData(BtCursor * pCsr,u32 offset,u32 amt,void * z)8058 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
8059   int rc;
8060   assert( cursorHoldsMutex(pCsr) );
8061   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
8062   assert( pCsr->isIncrblobHandle );
8063 
8064   rc = restoreCursorPosition(pCsr);
8065   if( rc!=SQLITE_OK ){
8066     return rc;
8067   }
8068   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
8069   if( pCsr->eState!=CURSOR_VALID ){
8070     return SQLITE_ABORT;
8071   }
8072 
8073   /* Check some assumptions:
8074   **   (a) the cursor is open for writing,
8075   **   (b) there is a read/write transaction open,
8076   **   (c) the connection holds a write-lock on the table (if required),
8077   **   (d) there are no conflicting read-locks, and
8078   **   (e) the cursor points at a valid row of an intKey table.
8079   */
8080   if( !pCsr->wrFlag ){
8081     return SQLITE_READONLY;
8082   }
8083   assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE );
8084   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
8085   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
8086   assert( pCsr->apPage[pCsr->iPage]->intKey );
8087 
8088   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
8089 }
8090 
8091 /*
8092 ** Set a flag on this cursor to cache the locations of pages from the
8093 ** overflow list for the current row. This is used by cursors opened
8094 ** for incremental blob IO only.
8095 **
8096 ** This function sets a flag only. The actual page location cache
8097 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
8098 ** accessPayload() (the worker function for sqlite3BtreeData() and
8099 ** sqlite3BtreePutData()).
8100 */
sqlite3BtreeCacheOverflow(BtCursor * pCur)8101 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
8102   assert( cursorHoldsMutex(pCur) );
8103   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
8104   invalidateOverflowCache(pCur);
8105   pCur->isIncrblobHandle = 1;
8106 }
8107 #endif
8108 
8109 /*
8110 ** Set both the "read version" (single byte at byte offset 18) and
8111 ** "write version" (single byte at byte offset 19) fields in the database
8112 ** header to iVersion.
8113 */
sqlite3BtreeSetVersion(Btree * pBtree,int iVersion)8114 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
8115   BtShared *pBt = pBtree->pBt;
8116   int rc;                         /* Return code */
8117 
8118   assert( pBtree->inTrans==TRANS_NONE );
8119   assert( iVersion==1 || iVersion==2 );
8120 
8121   /* If setting the version fields to 1, do not automatically open the
8122   ** WAL connection, even if the version fields are currently set to 2.
8123   */
8124   pBt->doNotUseWAL = (u8)(iVersion==1);
8125 
8126   rc = sqlite3BtreeBeginTrans(pBtree, 0);
8127   if( rc==SQLITE_OK ){
8128     u8 *aData = pBt->pPage1->aData;
8129     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
8130       rc = sqlite3BtreeBeginTrans(pBtree, 2);
8131       if( rc==SQLITE_OK ){
8132         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8133         if( rc==SQLITE_OK ){
8134           aData[18] = (u8)iVersion;
8135           aData[19] = (u8)iVersion;
8136         }
8137       }
8138     }
8139   }
8140 
8141   pBt->doNotUseWAL = 0;
8142   return rc;
8143 }
8144