1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 *************************************************************************
** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information,
** including a description of the file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23
/*
** Tracing support.
**
** Change the "#if 0" below to "#if 1" to compile in the global flag
** sqlite3BtreeTrace together with a TRACE() macro that, while the flag
** is non-zero, passes its (doubly parenthesized) argument list to
** printf() and then flushes stdout.  In the default configuration
** TRACE() expands to nothing at all.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
34
/*
** Read a 2-byte big-endian integer out of an array of unsigned bytes,
** mapping a stored value of zero to 65536.
**
** The "offset to cell content area" field in a btree page header needs
** this treatment: on an empty page with a 65536-byte page size the
** offset ought to be 65536, yet the two-byte field can only hold zero.
** Subtracting one, masking to 16 bits and adding one back performs the
** 0 -> 65536 adjustment while leaving every other value unchanged.
** The argument X is evaluated exactly once.
*/
#define get2byteNotZero(X) \
  ((((int)get2byte(X) - 1) & 0xffff) + 1)
45
46 #ifndef SQLITE_OMIT_SHARED_CACHE
47 /*
48 ** A list of BtShared objects that are eligible for participation
49 ** in shared cache. This variable has file scope during normal builds,
50 ** but the test harness needs to access it so we make it global for
51 ** test builds.
52 **
53 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
54 */
55 #ifdef SQLITE_TEST
56 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
57 #else
58 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
59 #endif
60 #endif /* SQLITE_OMIT_SHARED_CACHE */
61
62 #ifndef SQLITE_OMIT_SHARED_CACHE
63 /*
64 ** Enable or disable the shared pager and schema features.
65 **
66 ** This routine has no effect on existing database connections.
67 ** The shared cache setting effects only future calls to
68 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
69 */
sqlite3_enable_shared_cache(int enable)70 int sqlite3_enable_shared_cache(int enable){
71 sqlite3GlobalConfig.sharedCacheEnabled = enable;
72 return SQLITE_OK;
73 }
74 #endif
75
76
77
78 #ifdef SQLITE_OMIT_SHARED_CACHE
79 /*
80 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
81 ** and clearAllSharedCacheTableLocks()
82 ** manipulate entries in the BtShared.pLock linked list used to store
83 ** shared-cache table level locks. If the library is compiled with the
84 ** shared-cache feature disabled, then there is only ever one user
85 ** of each BtShared structure and so this locking is not necessary.
86 ** So define the lock related functions as no-ops.
87 */
88 #define querySharedCacheTableLock(a,b,c) SQLITE_OK
89 #define setSharedCacheTableLock(a,b,c) SQLITE_OK
90 #define clearAllSharedCacheTableLocks(a)
91 #define downgradeAllSharedCacheTableLocks(a)
92 #define hasSharedCacheTableLock(a,b,c,d) 1
93 #define hasReadConflicts(a, b) 0
94 #endif
95
96 #ifndef SQLITE_OMIT_SHARED_CACHE
97
98 #ifdef SQLITE_DEBUG
99 /*
100 **** This function is only used as part of an assert() statement. ***
101 **
102 ** Check to see if pBtree holds the required locks to read or write to the
103 ** table with root page iRoot. Return 1 if it does and 0 if not.
104 **
105 ** For example, when writing to a table with root-page iRoot via
106 ** Btree connection pBtree:
107 **
108 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
109 **
110 ** When writing to an index that resides in a sharable database, the
111 ** caller should have first obtained a lock specifying the root page of
112 ** the corresponding table. This makes things a bit more complicated,
113 ** as this module treats each table as a separate structure. To determine
114 ** the table corresponding to the index being written, this
115 ** function has to search through the database schema.
116 **
117 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
118 ** hold a write-lock on the schema table (root page 1). This is also
119 ** acceptable.
120 */
hasSharedCacheTableLock(Btree * pBtree,Pgno iRoot,int isIndex,int eLockType)121 static int hasSharedCacheTableLock(
122 Btree *pBtree, /* Handle that must hold lock */
123 Pgno iRoot, /* Root page of b-tree */
124 int isIndex, /* True if iRoot is the root of an index b-tree */
125 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */
126 ){
127 Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
128 Pgno iTab = 0;
129 BtLock *pLock;
130
131 /* If this database is not shareable, or if the client is reading
132 ** and has the read-uncommitted flag set, then no lock is required.
133 ** Return true immediately.
134 */
135 if( (pBtree->sharable==0)
136 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
137 ){
138 return 1;
139 }
140
141 /* If the client is reading or writing an index and the schema is
142 ** not loaded, then it is too difficult to actually check to see if
143 ** the correct locks are held. So do not bother - just return true.
144 ** This case does not come up very often anyhow.
145 */
146 if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){
147 return 1;
148 }
149
150 /* Figure out the root-page that the lock should be held on. For table
151 ** b-trees, this is just the root page of the b-tree being read or
152 ** written. For index b-trees, it is the root page of the associated
153 ** table. */
154 if( isIndex ){
155 HashElem *p;
156 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
157 Index *pIdx = (Index *)sqliteHashData(p);
158 if( pIdx->tnum==(int)iRoot ){
159 iTab = pIdx->pTable->tnum;
160 }
161 }
162 }else{
163 iTab = iRoot;
164 }
165
166 /* Search for the required lock. Either a write-lock on root-page iTab, a
167 ** write-lock on the schema table, or (if the client is reading) a
168 ** read-lock on iTab will suffice. Return 1 if any of these are found. */
169 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
170 if( pLock->pBtree==pBtree
171 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
172 && pLock->eLock>=eLockType
173 ){
174 return 1;
175 }
176 }
177
178 /* Failed to find the required lock. */
179 return 0;
180 }
181 #endif /* SQLITE_DEBUG */
182
183 #ifdef SQLITE_DEBUG
184 /*
185 **** This function may be used as part of assert() statements only. ****
186 **
187 ** Return true if it would be illegal for pBtree to write into the
188 ** table or index rooted at iRoot because other shared connections are
189 ** simultaneously reading that same table or index.
190 **
191 ** It is illegal for pBtree to write if some other Btree object that
192 ** shares the same BtShared object is currently reading or writing
193 ** the iRoot table. Except, if the other Btree object has the
194 ** read-uncommitted flag set, then it is OK for the other object to
195 ** have a read cursor.
196 **
197 ** For example, before writing to any part of the table or index
198 ** rooted at page iRoot, one should call:
199 **
200 ** assert( !hasReadConflicts(pBtree, iRoot) );
201 */
hasReadConflicts(Btree * pBtree,Pgno iRoot)202 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
203 BtCursor *p;
204 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
205 if( p->pgnoRoot==iRoot
206 && p->pBtree!=pBtree
207 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
208 ){
209 return 1;
210 }
211 }
212 return 0;
213 }
214 #endif /* #ifdef SQLITE_DEBUG */
215
216 /*
217 ** Query to see if Btree handle p may obtain a lock of type eLock
218 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
219 ** SQLITE_OK if the lock may be obtained (by calling
220 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
221 */
querySharedCacheTableLock(Btree * p,Pgno iTab,u8 eLock)222 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
223 BtShared *pBt = p->pBt;
224 BtLock *pIter;
225
226 assert( sqlite3BtreeHoldsMutex(p) );
227 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
228 assert( p->db!=0 );
229 assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
230
231 /* If requesting a write-lock, then the Btree must have an open write
232 ** transaction on this file. And, obviously, for this to be so there
233 ** must be an open write transaction on the file itself.
234 */
235 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
236 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
237
238 /* This routine is a no-op if the shared-cache is not enabled */
239 if( !p->sharable ){
240 return SQLITE_OK;
241 }
242
243 /* If some other connection is holding an exclusive lock, the
244 ** requested lock may not be obtained.
245 */
246 if( pBt->pWriter!=p && pBt->isExclusive ){
247 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
248 return SQLITE_LOCKED_SHAREDCACHE;
249 }
250
251 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
252 /* The condition (pIter->eLock!=eLock) in the following if(...)
253 ** statement is a simplification of:
254 **
255 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
256 **
257 ** since we know that if eLock==WRITE_LOCK, then no other connection
258 ** may hold a WRITE_LOCK on any table in this file (since there can
259 ** only be a single writer).
260 */
261 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
262 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
263 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
264 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
265 if( eLock==WRITE_LOCK ){
266 assert( p==pBt->pWriter );
267 pBt->isPending = 1;
268 }
269 return SQLITE_LOCKED_SHAREDCACHE;
270 }
271 }
272 return SQLITE_OK;
273 }
274 #endif /* !SQLITE_OMIT_SHARED_CACHE */
275
276 #ifndef SQLITE_OMIT_SHARED_CACHE
277 /*
278 ** Add a lock on the table with root-page iTable to the shared-btree used
279 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
280 ** WRITE_LOCK.
281 **
282 ** This function assumes the following:
283 **
284 ** (a) The specified Btree object p is connected to a sharable
285 ** database (one with the BtShared.sharable flag set), and
286 **
287 ** (b) No other Btree objects hold a lock that conflicts
288 ** with the requested lock (i.e. querySharedCacheTableLock() has
289 ** already been called and returned SQLITE_OK).
290 **
291 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
292 ** is returned if a malloc attempt fails.
293 */
setSharedCacheTableLock(Btree * p,Pgno iTable,u8 eLock)294 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
295 BtShared *pBt = p->pBt;
296 BtLock *pLock = 0;
297 BtLock *pIter;
298
299 assert( sqlite3BtreeHoldsMutex(p) );
300 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
301 assert( p->db!=0 );
302
303 /* A connection with the read-uncommitted flag set will never try to
304 ** obtain a read-lock using this function. The only read-lock obtained
305 ** by a connection in read-uncommitted mode is on the sqlite_master
306 ** table, and that lock is obtained in BtreeBeginTrans(). */
307 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
308
309 /* This function should only be called on a sharable b-tree after it
310 ** has been determined that no other b-tree holds a conflicting lock. */
311 assert( p->sharable );
312 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
313
314 /* First search the list for an existing lock on this table. */
315 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
316 if( pIter->iTable==iTable && pIter->pBtree==p ){
317 pLock = pIter;
318 break;
319 }
320 }
321
322 /* If the above search did not find a BtLock struct associating Btree p
323 ** with table iTable, allocate one and link it into the list.
324 */
325 if( !pLock ){
326 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
327 if( !pLock ){
328 return SQLITE_NOMEM;
329 }
330 pLock->iTable = iTable;
331 pLock->pBtree = p;
332 pLock->pNext = pBt->pLock;
333 pBt->pLock = pLock;
334 }
335
336 /* Set the BtLock.eLock variable to the maximum of the current lock
337 ** and the requested lock. This means if a write-lock was already held
338 ** and a read-lock requested, we don't incorrectly downgrade the lock.
339 */
340 assert( WRITE_LOCK>READ_LOCK );
341 if( eLock>pLock->eLock ){
342 pLock->eLock = eLock;
343 }
344
345 return SQLITE_OK;
346 }
347 #endif /* !SQLITE_OMIT_SHARED_CACHE */
348
349 #ifndef SQLITE_OMIT_SHARED_CACHE
350 /*
351 ** Release all the table locks (locks obtained via calls to
352 ** the setSharedCacheTableLock() procedure) held by Btree object p.
353 **
354 ** This function assumes that Btree p has an open read or write
355 ** transaction. If it does not, then the BtShared.isPending variable
356 ** may be incorrectly cleared.
357 */
clearAllSharedCacheTableLocks(Btree * p)358 static void clearAllSharedCacheTableLocks(Btree *p){
359 BtShared *pBt = p->pBt;
360 BtLock **ppIter = &pBt->pLock;
361
362 assert( sqlite3BtreeHoldsMutex(p) );
363 assert( p->sharable || 0==*ppIter );
364 assert( p->inTrans>0 );
365
366 while( *ppIter ){
367 BtLock *pLock = *ppIter;
368 assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree );
369 assert( pLock->pBtree->inTrans>=pLock->eLock );
370 if( pLock->pBtree==p ){
371 *ppIter = pLock->pNext;
372 assert( pLock->iTable!=1 || pLock==&p->lock );
373 if( pLock->iTable!=1 ){
374 sqlite3_free(pLock);
375 }
376 }else{
377 ppIter = &pLock->pNext;
378 }
379 }
380
381 assert( pBt->isPending==0 || pBt->pWriter );
382 if( pBt->pWriter==p ){
383 pBt->pWriter = 0;
384 pBt->isExclusive = 0;
385 pBt->isPending = 0;
386 }else if( pBt->nTransaction==2 ){
387 /* This function is called when Btree p is concluding its
388 ** transaction. If there currently exists a writer, and p is not
389 ** that writer, then the number of locks held by connections other
390 ** than the writer must be about to drop to zero. In this case
391 ** set the isPending flag to 0.
392 **
393 ** If there is not currently a writer, then BtShared.isPending must
394 ** be zero already. So this next line is harmless in that case.
395 */
396 pBt->isPending = 0;
397 }
398 }
399
400 /*
401 ** This function changes all write-locks held by Btree p into read-locks.
402 */
downgradeAllSharedCacheTableLocks(Btree * p)403 static void downgradeAllSharedCacheTableLocks(Btree *p){
404 BtShared *pBt = p->pBt;
405 if( pBt->pWriter==p ){
406 BtLock *pLock;
407 pBt->pWriter = 0;
408 pBt->isExclusive = 0;
409 pBt->isPending = 0;
410 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
411 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
412 pLock->eLock = READ_LOCK;
413 }
414 }
415 }
416
417 #endif /* SQLITE_OMIT_SHARED_CACHE */
418
419 static void releasePage(MemPage *pPage); /* Forward reference */
420
/*
***** This routine is used inside of assert() only ****
**
** Return the result of sqlite3_mutex_held() for the mutex of the
** BtShared that cursor p is attached to, i.e. check that the cursor
** holds the mutex on its BtShared.
*/
#ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  BtShared *pBt = p->pBt;
  return sqlite3_mutex_held(pBt->mutex);
}
#endif
431
432
433 #ifndef SQLITE_OMIT_INCRBLOB
434 /*
435 ** Invalidate the overflow page-list cache for cursor pCur, if any.
436 */
invalidateOverflowCache(BtCursor * pCur)437 static void invalidateOverflowCache(BtCursor *pCur){
438 assert( cursorHoldsMutex(pCur) );
439 sqlite3_free(pCur->aOverflow);
440 pCur->aOverflow = 0;
441 }
442
443 /*
444 ** Invalidate the overflow page-list cache for all cursors opened
445 ** on the shared btree structure pBt.
446 */
invalidateAllOverflowCache(BtShared * pBt)447 static void invalidateAllOverflowCache(BtShared *pBt){
448 BtCursor *p;
449 assert( sqlite3_mutex_held(pBt->mutex) );
450 for(p=pBt->pCursor; p; p=p->pNext){
451 invalidateOverflowCache(p);
452 }
453 }
454
455 /*
456 ** This function is called before modifying the contents of a table
457 ** to invalidate any incrblob cursors that are open on the
458 ** row or one of the rows being modified.
459 **
460 ** If argument isClearTable is true, then the entire contents of the
461 ** table is about to be deleted. In this case invalidate all incrblob
462 ** cursors open on any row within the table with root-page pgnoRoot.
463 **
464 ** Otherwise, if argument isClearTable is false, then the row with
465 ** rowid iRow is being replaced or deleted. In this case invalidate
466 ** only those incrblob cursors open on that specific row.
467 */
invalidateIncrblobCursors(Btree * pBtree,i64 iRow,int isClearTable)468 static void invalidateIncrblobCursors(
469 Btree *pBtree, /* The database file to check */
470 i64 iRow, /* The rowid that might be changing */
471 int isClearTable /* True if all rows are being deleted */
472 ){
473 BtCursor *p;
474 BtShared *pBt = pBtree->pBt;
475 assert( sqlite3BtreeHoldsMutex(pBtree) );
476 for(p=pBt->pCursor; p; p=p->pNext){
477 if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
478 p->eState = CURSOR_INVALID;
479 }
480 }
481 }
482
483 #else
484 /* Stub functions when INCRBLOB is omitted */
485 #define invalidateOverflowCache(x)
486 #define invalidateAllOverflowCache(x)
487 #define invalidateIncrblobCursors(x,y,z)
488 #endif /* SQLITE_OMIT_INCRBLOB */
489
490 /*
491 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
492 ** when a page that previously contained data becomes a free-list leaf
493 ** page.
494 **
495 ** The BtShared.pHasContent bitvec exists to work around an obscure
496 ** bug caused by the interaction of two useful IO optimizations surrounding
497 ** free-list leaf pages:
498 **
499 ** 1) When all data is deleted from a page and the page becomes
500 ** a free-list leaf page, the page is not written to the database
501 ** (as free-list leaf pages contain no meaningful data). Sometimes
502 ** such a page is not even journalled (as it will not be modified,
503 ** why bother journalling it?).
504 **
505 ** 2) When a free-list leaf page is reused, its content is not read
506 ** from the database or written to the journal file (why should it
507 ** be, if it is not at all meaningful?).
508 **
509 ** By themselves, these optimizations work fine and provide a handy
510 ** performance boost to bulk delete or insert operations. However, if
511 ** a page is moved to the free-list and then reused within the same
512 ** transaction, a problem comes up. If the page is not journalled when
513 ** it is moved to the free-list and it is also not journalled when it
514 ** is extracted from the free-list and reused, then the original data
515 ** may be lost. In the event of a rollback, it may not be possible
516 ** to restore the database to its original configuration.
517 **
518 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
519 ** moved to become a free-list leaf page, the corresponding bit is
520 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
521 ** optimization 2 above is omitted if the corresponding bit is already
522 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
523 ** at the end of every transaction.
524 */
btreeSetHasContent(BtShared * pBt,Pgno pgno)525 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
526 int rc = SQLITE_OK;
527 if( !pBt->pHasContent ){
528 assert( pgno<=pBt->nPage );
529 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
530 if( !pBt->pHasContent ){
531 rc = SQLITE_NOMEM;
532 }
533 }
534 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
535 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
536 }
537 return rc;
538 }
539
540 /*
541 ** Query the BtShared.pHasContent vector.
542 **
543 ** This function is called when a free-list leaf page is removed from the
544 ** free-list for reuse. It returns false if it is safe to retrieve the
545 ** page from the pager layer with the 'no-content' flag set. True otherwise.
546 */
btreeGetHasContent(BtShared * pBt,Pgno pgno)547 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
548 Bitvec *p = pBt->pHasContent;
549 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
550 }
551
552 /*
553 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
554 ** invoked at the conclusion of each write-transaction.
555 */
btreeClearHasContent(BtShared * pBt)556 static void btreeClearHasContent(BtShared *pBt){
557 sqlite3BitvecDestroy(pBt->pHasContent);
558 pBt->pHasContent = 0;
559 }
560
561 /*
562 ** Save the current cursor position in the variables BtCursor.nKey
563 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
564 **
565 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
566 ** prior to calling this routine.
567 */
saveCursorPosition(BtCursor * pCur)568 static int saveCursorPosition(BtCursor *pCur){
569 int rc;
570
571 assert( CURSOR_VALID==pCur->eState );
572 assert( 0==pCur->pKey );
573 assert( cursorHoldsMutex(pCur) );
574
575 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
576 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */
577
578 /* If this is an intKey table, then the above call to BtreeKeySize()
579 ** stores the integer key in pCur->nKey. In this case this value is
580 ** all that is required. Otherwise, if pCur is not open on an intKey
581 ** table, then malloc space for and store the pCur->nKey bytes of key
582 ** data.
583 */
584 if( 0==pCur->apPage[0]->intKey ){
585 void *pKey = sqlite3Malloc( (int)pCur->nKey );
586 if( pKey ){
587 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
588 if( rc==SQLITE_OK ){
589 pCur->pKey = pKey;
590 }else{
591 sqlite3_free(pKey);
592 }
593 }else{
594 rc = SQLITE_NOMEM;
595 }
596 }
597 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
598
599 if( rc==SQLITE_OK ){
600 int i;
601 for(i=0; i<=pCur->iPage; i++){
602 releasePage(pCur->apPage[i]);
603 pCur->apPage[i] = 0;
604 }
605 pCur->iPage = -1;
606 pCur->eState = CURSOR_REQUIRESEEK;
607 }
608
609 invalidateOverflowCache(pCur);
610 return rc;
611 }
612
613 /*
614 ** Save the positions of all cursors (except pExcept) that are open on
615 ** the table with root-page iRoot. Usually, this is called just before cursor
616 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
617 */
saveAllCursors(BtShared * pBt,Pgno iRoot,BtCursor * pExcept)618 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
619 BtCursor *p;
620 assert( sqlite3_mutex_held(pBt->mutex) );
621 assert( pExcept==0 || pExcept->pBt==pBt );
622 for(p=pBt->pCursor; p; p=p->pNext){
623 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
624 p->eState==CURSOR_VALID ){
625 int rc = saveCursorPosition(p);
626 if( SQLITE_OK!=rc ){
627 return rc;
628 }
629 }
630 }
631 return SQLITE_OK;
632 }
633
634 /*
635 ** Clear the current cursor position.
636 */
sqlite3BtreeClearCursor(BtCursor * pCur)637 void sqlite3BtreeClearCursor(BtCursor *pCur){
638 assert( cursorHoldsMutex(pCur) );
639 sqlite3_free(pCur->pKey);
640 pCur->pKey = 0;
641 pCur->eState = CURSOR_INVALID;
642 }
643
644 /*
645 ** In this version of BtreeMoveto, pKey is a packed index record
646 ** such as is generated by the OP_MakeRecord opcode. Unpack the
647 ** record and then call BtreeMovetoUnpacked() to do the work.
648 */
btreeMoveto(BtCursor * pCur,const void * pKey,i64 nKey,int bias,int * pRes)649 static int btreeMoveto(
650 BtCursor *pCur, /* Cursor open on the btree to be searched */
651 const void *pKey, /* Packed key if the btree is an index */
652 i64 nKey, /* Integer key for tables. Size of pKey for indices */
653 int bias, /* Bias search to the high end */
654 int *pRes /* Write search results here */
655 ){
656 int rc; /* Status code */
657 UnpackedRecord *pIdxKey; /* Unpacked index key */
658 char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */
659
660 if( pKey ){
661 assert( nKey==(i64)(int)nKey );
662 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey,
663 aSpace, sizeof(aSpace));
664 if( pIdxKey==0 ) return SQLITE_NOMEM;
665 }else{
666 pIdxKey = 0;
667 }
668 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
669 if( pKey ){
670 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
671 }
672 return rc;
673 }
674
675 /*
676 ** Restore the cursor to the position it was in (or as close to as possible)
677 ** when saveCursorPosition() was called. Note that this call deletes the
678 ** saved position info stored by saveCursorPosition(), so there can be
679 ** at most one effective restoreCursorPosition() call after each
680 ** saveCursorPosition().
681 */
btreeRestoreCursorPosition(BtCursor * pCur)682 static int btreeRestoreCursorPosition(BtCursor *pCur){
683 int rc;
684 assert( cursorHoldsMutex(pCur) );
685 assert( pCur->eState>=CURSOR_REQUIRESEEK );
686 if( pCur->eState==CURSOR_FAULT ){
687 return pCur->skipNext;
688 }
689 pCur->eState = CURSOR_INVALID;
690 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
691 if( rc==SQLITE_OK ){
692 sqlite3_free(pCur->pKey);
693 pCur->pKey = 0;
694 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
695 }
696 return rc;
697 }
698
/*
** Restore the position of cursor p, but only if its state is
** CURSOR_REQUIRESEEK or greater; otherwise evaluate to SQLITE_OK without
** calling btreeRestoreCursorPosition().
**
** The argument is parenthesized so that any pointer-valued expression
** (for example "&aCsr[i]") may be passed.  Note that p is evaluated
** twice when a restore is needed, so it must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
703
704 /*
705 ** Determine whether or not a cursor has moved from the position it
706 ** was last placed at. Cursors can move when the row they are pointing
707 ** at is deleted out from under them.
708 **
709 ** This routine returns an error code if something goes wrong. The
710 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
711 */
sqlite3BtreeCursorHasMoved(BtCursor * pCur,int * pHasMoved)712 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
713 int rc;
714
715 rc = restoreCursorPosition(pCur);
716 if( rc ){
717 *pHasMoved = 1;
718 return rc;
719 }
720 if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){
721 *pHasMoved = 1;
722 }else{
723 *pHasMoved = 0;
724 }
725 return SQLITE_OK;
726 }
727
728 #ifndef SQLITE_OMIT_AUTOVACUUM
729 /*
730 ** Given a page number of a regular database page, return the page
731 ** number for the pointer-map page that contains the entry for the
732 ** input page number.
733 **
734 ** Return 0 (not a valid page) for pgno==1 since there is
735 ** no pointer map associated with page 1. The integrity_check logic
736 ** requires that ptrmapPageno(*,1)!=1.
737 */
ptrmapPageno(BtShared * pBt,Pgno pgno)738 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
739 int nPagesPerMapPage;
740 Pgno iPtrMap, ret;
741 assert( sqlite3_mutex_held(pBt->mutex) );
742 if( pgno<2 ) return 0;
743 nPagesPerMapPage = (pBt->usableSize/5)+1;
744 iPtrMap = (pgno-2)/nPagesPerMapPage;
745 ret = (iPtrMap*nPagesPerMapPage) + 2;
746 if( ret==PENDING_BYTE_PAGE(pBt) ){
747 ret++;
748 }
749 return ret;
750 }
751
752 /*
753 ** Write an entry into the pointer map.
754 **
755 ** This routine updates the pointer map entry for page number 'key'
756 ** so that it maps to type 'eType' and parent page number 'pgno'.
757 **
758 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
759 ** a no-op. If an error occurs, the appropriate error code is written
760 ** into *pRC.
761 */
ptrmapPut(BtShared * pBt,Pgno key,u8 eType,Pgno parent,int * pRC)762 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
763 DbPage *pDbPage; /* The pointer map page */
764 u8 *pPtrmap; /* The pointer map data */
765 Pgno iPtrmap; /* The pointer map page number */
766 int offset; /* Offset in pointer map page */
767 int rc; /* Return code from subfunctions */
768
769 if( *pRC ) return;
770
771 assert( sqlite3_mutex_held(pBt->mutex) );
772 /* The master-journal page number must never be used as a pointer map page */
773 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
774
775 assert( pBt->autoVacuum );
776 if( key==0 ){
777 *pRC = SQLITE_CORRUPT_BKPT;
778 return;
779 }
780 iPtrmap = PTRMAP_PAGENO(pBt, key);
781 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
782 if( rc!=SQLITE_OK ){
783 *pRC = rc;
784 return;
785 }
786 offset = PTRMAP_PTROFFSET(iPtrmap, key);
787 if( offset<0 ){
788 *pRC = SQLITE_CORRUPT_BKPT;
789 goto ptrmap_exit;
790 }
791 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
792
793 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
794 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
795 *pRC= rc = sqlite3PagerWrite(pDbPage);
796 if( rc==SQLITE_OK ){
797 pPtrmap[offset] = eType;
798 put4byte(&pPtrmap[offset+1], parent);
799 }
800 }
801
802 ptrmap_exit:
803 sqlite3PagerUnref(pDbPage);
804 }
805
806 /*
807 ** Read an entry from the pointer map.
808 **
809 ** This routine retrieves the pointer map entry for page 'key', writing
810 ** the type and parent page number to *pEType and *pPgno respectively.
811 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
812 */
ptrmapGet(BtShared * pBt,Pgno key,u8 * pEType,Pgno * pPgno)813 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
814 DbPage *pDbPage; /* The pointer map page */
815 int iPtrmap; /* Pointer map page index */
816 u8 *pPtrmap; /* Pointer map page data */
817 int offset; /* Offset of entry in pointer map */
818 int rc;
819
820 assert( sqlite3_mutex_held(pBt->mutex) );
821
822 iPtrmap = PTRMAP_PAGENO(pBt, key);
823 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
824 if( rc!=0 ){
825 return rc;
826 }
827 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
828
829 offset = PTRMAP_PTROFFSET(iPtrmap, key);
830 assert( pEType!=0 );
831 *pEType = pPtrmap[offset];
832 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
833
834 sqlite3PagerUnref(pDbPage);
835 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
836 return SQLITE_OK;
837 }
838
839 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
840 #define ptrmapPut(w,x,y,z,rc)
841 #define ptrmapGet(w,x,y,z) SQLITE_OK
842 #define ptrmapPutOvflPtr(x, y, rc)
843 #endif
844
/*
** Return a pointer to the content of cell I on btree page P, where
** I==0 is the first cell on the page, I==1 the second, and so on.
**
** The 2-byte entry at aData[cellOffset+2*I] is read, masked with
** maskPage, and used as an offset from the start of aData.
**
** This macro is only correct for pages without overflow cells.
*/
#define findCell(P,I) \
  ((P)->aData + \
    ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset + 2*(I)])))
854
855 /*
856 ** This a more complex version of findCell() that works for
857 ** pages that do contain overflow cells.
858 */
findOverflowCell(MemPage * pPage,int iCell)859 static u8 *findOverflowCell(MemPage *pPage, int iCell){
860 int i;
861 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
862 for(i=pPage->nOverflow-1; i>=0; i--){
863 int k;
864 struct _OvflCell *pOvfl;
865 pOvfl = &pPage->aOvfl[i];
866 k = pOvfl->idx;
867 if( k<=iCell ){
868 if( k==iCell ){
869 return pOvfl->pCell;
870 }
871 iCell--;
872 }
873 }
874 return findCell(pPage, iCell);
875 }
876
877 /*
878 ** Parse a cell content block and fill in the CellInfo structure. There
879 ** are two versions of this function. btreeParseCell() takes a
880 ** cell index as the second argument and btreeParseCellPtr()
881 ** takes a pointer to the body of the cell as its second argument.
882 **
883 ** Within this file, the parseCell() macro can be called instead of
884 ** btreeParseCellPtr(). Using some compilers, this will be faster.
885 */
btreeParseCellPtr(MemPage * pPage,u8 * pCell,CellInfo * pInfo)886 static void btreeParseCellPtr(
887 MemPage *pPage, /* Page containing the cell */
888 u8 *pCell, /* Pointer to the cell text. */
889 CellInfo *pInfo /* Fill in this structure */
890 ){
891 u16 n; /* Number bytes in cell content header */
892 u32 nPayload; /* Number of bytes of cell payload */
893
894 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
895
896 pInfo->pCell = pCell;
897 assert( pPage->leaf==0 || pPage->leaf==1 );
898 n = pPage->childPtrSize;
899 assert( n==4-4*pPage->leaf );
900 if( pPage->intKey ){
901 if( pPage->hasData ){
902 n += getVarint32(&pCell[n], nPayload);
903 }else{
904 nPayload = 0;
905 }
906 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
907 pInfo->nData = nPayload;
908 }else{
909 pInfo->nData = 0;
910 n += getVarint32(&pCell[n], nPayload);
911 pInfo->nKey = nPayload;
912 }
913 pInfo->nPayload = nPayload;
914 pInfo->nHeader = n;
915 testcase( nPayload==pPage->maxLocal );
916 testcase( nPayload==pPage->maxLocal+1 );
917 if( likely(nPayload<=pPage->maxLocal) ){
918 /* This is the (easy) common case where the entire payload fits
919 ** on the local page. No overflow is required.
920 */
921 if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4;
922 pInfo->nLocal = (u16)nPayload;
923 pInfo->iOverflow = 0;
924 }else{
925 /* If the payload will not fit completely on the local page, we have
926 ** to decide how much to store locally and how much to spill onto
927 ** overflow pages. The strategy is to minimize the amount of unused
928 ** space on overflow pages while keeping the amount of local storage
929 ** in between minLocal and maxLocal.
930 **
931 ** Warning: changing the way overflow payload is distributed in any
932 ** way will result in an incompatible file format.
933 */
934 int minLocal; /* Minimum amount of payload held locally */
935 int maxLocal; /* Maximum amount of payload held locally */
936 int surplus; /* Overflow payload available for local storage */
937
938 minLocal = pPage->minLocal;
939 maxLocal = pPage->maxLocal;
940 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
941 testcase( surplus==maxLocal );
942 testcase( surplus==maxLocal+1 );
943 if( surplus <= maxLocal ){
944 pInfo->nLocal = (u16)surplus;
945 }else{
946 pInfo->nLocal = (u16)minLocal;
947 }
948 pInfo->iOverflow = (u16)(pInfo->nLocal + n);
949 pInfo->nSize = pInfo->iOverflow + 4;
950 }
951 }
952 #define parseCell(pPage, iCell, pInfo) \
953 btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
btreeParseCell(MemPage * pPage,int iCell,CellInfo * pInfo)954 static void btreeParseCell(
955 MemPage *pPage, /* Page containing the cell */
956 int iCell, /* The cell index. First cell is 0 */
957 CellInfo *pInfo /* Fill in this structure */
958 ){
959 parseCell(pPage, iCell, pInfo);
960 }
961
962 /*
963 ** Compute the total number of bytes that a Cell needs in the cell
964 ** data area of the btree-page. The return number includes the cell
965 ** data header and the local payload, but not any overflow page or
966 ** the space used by the cell pointer.
967 */
cellSizePtr(MemPage * pPage,u8 * pCell)968 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
969 u8 *pIter = &pCell[pPage->childPtrSize];
970 u32 nSize;
971
972 #ifdef SQLITE_DEBUG
973 /* The value returned by this function should always be the same as
974 ** the (CellInfo.nSize) value found by doing a full parse of the
975 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
976 ** this function verifies that this invariant is not violated. */
977 CellInfo debuginfo;
978 btreeParseCellPtr(pPage, pCell, &debuginfo);
979 #endif
980
981 if( pPage->intKey ){
982 u8 *pEnd;
983 if( pPage->hasData ){
984 pIter += getVarint32(pIter, nSize);
985 }else{
986 nSize = 0;
987 }
988
989 /* pIter now points at the 64-bit integer key value, a variable length
990 ** integer. The following block moves pIter to point at the first byte
991 ** past the end of the key value. */
992 pEnd = &pIter[9];
993 while( (*pIter++)&0x80 && pIter<pEnd );
994 }else{
995 pIter += getVarint32(pIter, nSize);
996 }
997
998 testcase( nSize==pPage->maxLocal );
999 testcase( nSize==pPage->maxLocal+1 );
1000 if( nSize>pPage->maxLocal ){
1001 int minLocal = pPage->minLocal;
1002 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1003 testcase( nSize==pPage->maxLocal );
1004 testcase( nSize==pPage->maxLocal+1 );
1005 if( nSize>pPage->maxLocal ){
1006 nSize = minLocal;
1007 }
1008 nSize += 4;
1009 }
1010 nSize += (u32)(pIter - pCell);
1011
1012 /* The minimum size of any cell is 4 bytes. */
1013 if( nSize<4 ){
1014 nSize = 4;
1015 }
1016
1017 assert( nSize==debuginfo.nSize );
1018 return (u16)nSize;
1019 }
1020
#ifdef SQLITE_DEBUG
/* Index-based wrapper around cellSizePtr().  It is compiled only when
** SQLITE_DEBUG is defined and is intended for use inside assert()
** statements. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return cellSizePtr(pPage, pCell);
}
#endif
1028
1029 #ifndef SQLITE_OMIT_AUTOVACUUM
1030 /*
1031 ** If the cell pCell, part of page pPage contains a pointer
1032 ** to an overflow page, insert an entry into the pointer-map
1033 ** for the overflow page.
1034 */
ptrmapPutOvflPtr(MemPage * pPage,u8 * pCell,int * pRC)1035 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1036 CellInfo info;
1037 if( *pRC ) return;
1038 assert( pCell!=0 );
1039 btreeParseCellPtr(pPage, pCell, &info);
1040 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1041 if( info.iOverflow ){
1042 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1043 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1044 }
1045 }
1046 #endif
1047
1048
1049 /*
1050 ** Defragment the page given. All Cells are moved to the
1051 ** end of the page and all free space is collected into one
1052 ** big FreeBlk that occurs in between the header and cell
1053 ** pointer array and the cell content area.
1054 */
defragmentPage(MemPage * pPage)1055 static int defragmentPage(MemPage *pPage){
1056 int i; /* Loop counter */
1057 int pc; /* Address of a i-th cell */
1058 int hdr; /* Offset to the page header */
1059 int size; /* Size of a cell */
1060 int usableSize; /* Number of usable bytes on a page */
1061 int cellOffset; /* Offset to the cell pointer array */
1062 int cbrk; /* Offset to the cell content area */
1063 int nCell; /* Number of cells on the page */
1064 unsigned char *data; /* The page data */
1065 unsigned char *temp; /* Temp area for cell content */
1066 int iCellFirst; /* First allowable cell index */
1067 int iCellLast; /* Last possible cell index */
1068
1069
1070 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1071 assert( pPage->pBt!=0 );
1072 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1073 assert( pPage->nOverflow==0 );
1074 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1075 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1076 data = pPage->aData;
1077 hdr = pPage->hdrOffset;
1078 cellOffset = pPage->cellOffset;
1079 nCell = pPage->nCell;
1080 assert( nCell==get2byte(&data[hdr+3]) );
1081 usableSize = pPage->pBt->usableSize;
1082 cbrk = get2byte(&data[hdr+5]);
1083 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
1084 cbrk = usableSize;
1085 iCellFirst = cellOffset + 2*nCell;
1086 iCellLast = usableSize - 4;
1087 for(i=0; i<nCell; i++){
1088 u8 *pAddr; /* The i-th cell pointer */
1089 pAddr = &data[cellOffset + i*2];
1090 pc = get2byte(pAddr);
1091 testcase( pc==iCellFirst );
1092 testcase( pc==iCellLast );
1093 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1094 /* These conditions have already been verified in btreeInitPage()
1095 ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
1096 */
1097 if( pc<iCellFirst || pc>iCellLast ){
1098 return SQLITE_CORRUPT_BKPT;
1099 }
1100 #endif
1101 assert( pc>=iCellFirst && pc<=iCellLast );
1102 size = cellSizePtr(pPage, &temp[pc]);
1103 cbrk -= size;
1104 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1105 if( cbrk<iCellFirst ){
1106 return SQLITE_CORRUPT_BKPT;
1107 }
1108 #else
1109 if( cbrk<iCellFirst || pc+size>usableSize ){
1110 return SQLITE_CORRUPT_BKPT;
1111 }
1112 #endif
1113 assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1114 testcase( cbrk+size==usableSize );
1115 testcase( pc+size==usableSize );
1116 memcpy(&data[cbrk], &temp[pc], size);
1117 put2byte(pAddr, cbrk);
1118 }
1119 assert( cbrk>=iCellFirst );
1120 put2byte(&data[hdr+5], cbrk);
1121 data[hdr+1] = 0;
1122 data[hdr+2] = 0;
1123 data[hdr+7] = 0;
1124 memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1125 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1126 if( cbrk-iCellFirst!=pPage->nFree ){
1127 return SQLITE_CORRUPT_BKPT;
1128 }
1129 return SQLITE_OK;
1130 }
1131
1132 /*
1133 ** Allocate nByte bytes of space from within the B-Tree page passed
1134 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1135 ** of the first byte of allocated space. Return either SQLITE_OK or
1136 ** an error code (usually SQLITE_CORRUPT).
1137 **
1138 ** The caller guarantees that there is sufficient space to make the
1139 ** allocation. This routine might need to defragment in order to bring
1140 ** all the space together, however. This routine will avoid using
1141 ** the first two bytes past the cell pointer area since presumably this
1142 ** allocation is being made in order to insert a new cell, so we will
1143 ** also end up needing a new cell pointer.
1144 */
allocateSpace(MemPage * pPage,int nByte,int * pIdx)1145 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1146 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
1147 u8 * const data = pPage->aData; /* Local cache of pPage->aData */
1148 int nFrag; /* Number of fragmented bytes on pPage */
1149 int top; /* First byte of cell content area */
1150 int gap; /* First byte of gap between cell pointers and cell content */
1151 int rc; /* Integer return code */
1152 int usableSize; /* Usable size of the page */
1153
1154 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1155 assert( pPage->pBt );
1156 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1157 assert( nByte>=0 ); /* Minimum cell size is 4 */
1158 assert( pPage->nFree>=nByte );
1159 assert( pPage->nOverflow==0 );
1160 usableSize = pPage->pBt->usableSize;
1161 assert( nByte < usableSize-8 );
1162
1163 nFrag = data[hdr+7];
1164 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1165 gap = pPage->cellOffset + 2*pPage->nCell;
1166 top = get2byteNotZero(&data[hdr+5]);
1167 if( gap>top ) return SQLITE_CORRUPT_BKPT;
1168 testcase( gap+2==top );
1169 testcase( gap+1==top );
1170 testcase( gap==top );
1171
1172 if( nFrag>=60 ){
1173 /* Always defragment highly fragmented pages */
1174 rc = defragmentPage(pPage);
1175 if( rc ) return rc;
1176 top = get2byteNotZero(&data[hdr+5]);
1177 }else if( gap+2<=top ){
1178 /* Search the freelist looking for a free slot big enough to satisfy
1179 ** the request. The allocation is made from the first free slot in
1180 ** the list that is large enough to accomadate it.
1181 */
1182 int pc, addr;
1183 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
1184 int size; /* Size of the free slot */
1185 if( pc>usableSize-4 || pc<addr+4 ){
1186 return SQLITE_CORRUPT_BKPT;
1187 }
1188 size = get2byte(&data[pc+2]);
1189 if( size>=nByte ){
1190 int x = size - nByte;
1191 testcase( x==4 );
1192 testcase( x==3 );
1193 if( x<4 ){
1194 /* Remove the slot from the free-list. Update the number of
1195 ** fragmented bytes within the page. */
1196 memcpy(&data[addr], &data[pc], 2);
1197 data[hdr+7] = (u8)(nFrag + x);
1198 }else if( size+pc > usableSize ){
1199 return SQLITE_CORRUPT_BKPT;
1200 }else{
1201 /* The slot remains on the free-list. Reduce its size to account
1202 ** for the portion used by the new allocation. */
1203 put2byte(&data[pc+2], x);
1204 }
1205 *pIdx = pc + x;
1206 return SQLITE_OK;
1207 }
1208 }
1209 }
1210
1211 /* Check to make sure there is enough space in the gap to satisfy
1212 ** the allocation. If not, defragment.
1213 */
1214 testcase( gap+2+nByte==top );
1215 if( gap+2+nByte>top ){
1216 rc = defragmentPage(pPage);
1217 if( rc ) return rc;
1218 top = get2byteNotZero(&data[hdr+5]);
1219 assert( gap+nByte<=top );
1220 }
1221
1222
1223 /* Allocate memory from the gap in between the cell pointer array
1224 ** and the cell content area. The btreeInitPage() call has already
1225 ** validated the freelist. Given that the freelist is valid, there
1226 ** is no way that the allocation can extend off the end of the page.
1227 ** The assert() below verifies the previous sentence.
1228 */
1229 top -= nByte;
1230 put2byte(&data[hdr+5], top);
1231 assert( top+nByte <= (int)pPage->pBt->usableSize );
1232 *pIdx = top;
1233 return SQLITE_OK;
1234 }
1235
1236 /*
1237 ** Return a section of the pPage->aData to the freelist.
1238 ** The first byte of the new free block is pPage->aDisk[start]
1239 ** and the size of the block is "size" bytes.
1240 **
1241 ** Most of the effort here is involved in coalesing adjacent
1242 ** free blocks into a single big free block.
1243 */
freeSpace(MemPage * pPage,int start,int size)1244 static int freeSpace(MemPage *pPage, int start, int size){
1245 int addr, pbegin, hdr;
1246 int iLast; /* Largest possible freeblock offset */
1247 unsigned char *data = pPage->aData;
1248
1249 assert( pPage->pBt!=0 );
1250 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1251 assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
1252 assert( (start + size) <= (int)pPage->pBt->usableSize );
1253 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1254 assert( size>=0 ); /* Minimum cell size is 4 */
1255
1256 if( pPage->pBt->secureDelete ){
1257 /* Overwrite deleted information with zeros when the secure_delete
1258 ** option is enabled */
1259 memset(&data[start], 0, size);
1260 }
1261
1262 /* Add the space back into the linked list of freeblocks. Note that
1263 ** even though the freeblock list was checked by btreeInitPage(),
1264 ** btreeInitPage() did not detect overlapping cells or
1265 ** freeblocks that overlapped cells. Nor does it detect when the
1266 ** cell content area exceeds the value in the page header. If these
1267 ** situations arise, then subsequent insert operations might corrupt
1268 ** the freelist. So we do need to check for corruption while scanning
1269 ** the freelist.
1270 */
1271 hdr = pPage->hdrOffset;
1272 addr = hdr + 1;
1273 iLast = pPage->pBt->usableSize - 4;
1274 assert( start<=iLast );
1275 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1276 if( pbegin<addr+4 ){
1277 return SQLITE_CORRUPT_BKPT;
1278 }
1279 addr = pbegin;
1280 }
1281 if( pbegin>iLast ){
1282 return SQLITE_CORRUPT_BKPT;
1283 }
1284 assert( pbegin>addr || pbegin==0 );
1285 put2byte(&data[addr], start);
1286 put2byte(&data[start], pbegin);
1287 put2byte(&data[start+2], size);
1288 pPage->nFree = pPage->nFree + (u16)size;
1289
1290 /* Coalesce adjacent free blocks */
1291 addr = hdr + 1;
1292 while( (pbegin = get2byte(&data[addr]))>0 ){
1293 int pnext, psize, x;
1294 assert( pbegin>addr );
1295 assert( pbegin <= (int)pPage->pBt->usableSize-4 );
1296 pnext = get2byte(&data[pbegin]);
1297 psize = get2byte(&data[pbegin+2]);
1298 if( pbegin + psize + 3 >= pnext && pnext>0 ){
1299 int frag = pnext - (pbegin+psize);
1300 if( (frag<0) || (frag>(int)data[hdr+7]) ){
1301 return SQLITE_CORRUPT_BKPT;
1302 }
1303 data[hdr+7] -= (u8)frag;
1304 x = get2byte(&data[pnext]);
1305 put2byte(&data[pbegin], x);
1306 x = pnext + get2byte(&data[pnext+2]) - pbegin;
1307 put2byte(&data[pbegin+2], x);
1308 }else{
1309 addr = pbegin;
1310 }
1311 }
1312
1313 /* If the cell content area begins with a freeblock, remove it. */
1314 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1315 int top;
1316 pbegin = get2byte(&data[hdr+1]);
1317 memcpy(&data[hdr+1], &data[pbegin], 2);
1318 top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
1319 put2byte(&data[hdr+5], top);
1320 }
1321 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1322 return SQLITE_OK;
1323 }
1324
1325 /*
1326 ** Decode the flags byte (the first byte of the header) for a page
1327 ** and initialize fields of the MemPage structure accordingly.
1328 **
1329 ** Only the following combinations are supported. Anything different
1330 ** indicates a corrupt database files:
1331 **
1332 ** PTF_ZERODATA
1333 ** PTF_ZERODATA | PTF_LEAF
1334 ** PTF_LEAFDATA | PTF_INTKEY
1335 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1336 */
decodeFlags(MemPage * pPage,int flagByte)1337 static int decodeFlags(MemPage *pPage, int flagByte){
1338 BtShared *pBt; /* A copy of pPage->pBt */
1339
1340 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1341 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1342 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );
1343 flagByte &= ~PTF_LEAF;
1344 pPage->childPtrSize = 4-4*pPage->leaf;
1345 pBt = pPage->pBt;
1346 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1347 pPage->intKey = 1;
1348 pPage->hasData = pPage->leaf;
1349 pPage->maxLocal = pBt->maxLeaf;
1350 pPage->minLocal = pBt->minLeaf;
1351 }else if( flagByte==PTF_ZERODATA ){
1352 pPage->intKey = 0;
1353 pPage->hasData = 0;
1354 pPage->maxLocal = pBt->maxLocal;
1355 pPage->minLocal = pBt->minLocal;
1356 }else{
1357 return SQLITE_CORRUPT_BKPT;
1358 }
1359 return SQLITE_OK;
1360 }
1361
1362 /*
1363 ** Initialize the auxiliary information for a disk block.
1364 **
1365 ** Return SQLITE_OK on success. If we see that the page does
1366 ** not contain a well-formed database page, then return
1367 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1368 ** guarantee that the page is well-formed. It only shows that
1369 ** we failed to detect any corruption.
1370 */
btreeInitPage(MemPage * pPage)1371 static int btreeInitPage(MemPage *pPage){
1372
1373 assert( pPage->pBt!=0 );
1374 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1375 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1376 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1377 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1378
1379 if( !pPage->isInit ){
1380 u16 pc; /* Address of a freeblock within pPage->aData[] */
1381 u8 hdr; /* Offset to beginning of page header */
1382 u8 *data; /* Equal to pPage->aData */
1383 BtShared *pBt; /* The main btree structure */
1384 int usableSize; /* Amount of usable space on each page */
1385 u16 cellOffset; /* Offset from start of page to first cell pointer */
1386 int nFree; /* Number of unused bytes on the page */
1387 int top; /* First byte of the cell content area */
1388 int iCellFirst; /* First allowable cell or freeblock offset */
1389 int iCellLast; /* Last possible cell or freeblock offset */
1390
1391 pBt = pPage->pBt;
1392
1393 hdr = pPage->hdrOffset;
1394 data = pPage->aData;
1395 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1396 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1397 pPage->maskPage = (u16)(pBt->pageSize - 1);
1398 pPage->nOverflow = 0;
1399 usableSize = pBt->usableSize;
1400 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1401 top = get2byteNotZero(&data[hdr+5]);
1402 pPage->nCell = get2byte(&data[hdr+3]);
1403 if( pPage->nCell>MX_CELL(pBt) ){
1404 /* To many cells for a single page. The page must be corrupt */
1405 return SQLITE_CORRUPT_BKPT;
1406 }
1407 testcase( pPage->nCell==MX_CELL(pBt) );
1408
1409 /* A malformed database page might cause us to read past the end
1410 ** of page when parsing a cell.
1411 **
1412 ** The following block of code checks early to see if a cell extends
1413 ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1414 ** returned if it does.
1415 */
1416 iCellFirst = cellOffset + 2*pPage->nCell;
1417 iCellLast = usableSize - 4;
1418 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1419 {
1420 int i; /* Index into the cell pointer array */
1421 int sz; /* Size of a cell */
1422
1423 if( !pPage->leaf ) iCellLast--;
1424 for(i=0; i<pPage->nCell; i++){
1425 pc = get2byte(&data[cellOffset+i*2]);
1426 testcase( pc==iCellFirst );
1427 testcase( pc==iCellLast );
1428 if( pc<iCellFirst || pc>iCellLast ){
1429 return SQLITE_CORRUPT_BKPT;
1430 }
1431 sz = cellSizePtr(pPage, &data[pc]);
1432 testcase( pc+sz==usableSize );
1433 if( pc+sz>usableSize ){
1434 return SQLITE_CORRUPT_BKPT;
1435 }
1436 }
1437 if( !pPage->leaf ) iCellLast++;
1438 }
1439 #endif
1440
1441 /* Compute the total free space on the page */
1442 pc = get2byte(&data[hdr+1]);
1443 nFree = data[hdr+7] + top;
1444 while( pc>0 ){
1445 u16 next, size;
1446 if( pc<iCellFirst || pc>iCellLast ){
1447 /* Start of free block is off the page */
1448 return SQLITE_CORRUPT_BKPT;
1449 }
1450 next = get2byte(&data[pc]);
1451 size = get2byte(&data[pc+2]);
1452 if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1453 /* Free blocks must be in ascending order. And the last byte of
1454 ** the free-block must lie on the database page. */
1455 return SQLITE_CORRUPT_BKPT;
1456 }
1457 nFree = nFree + size;
1458 pc = next;
1459 }
1460
1461 /* At this point, nFree contains the sum of the offset to the start
1462 ** of the cell-content area plus the number of free bytes within
1463 ** the cell-content area. If this is greater than the usable-size
1464 ** of the page, then the page must be corrupted. This check also
1465 ** serves to verify that the offset to the start of the cell-content
1466 ** area, according to the page header, lies within the page.
1467 */
1468 if( nFree>usableSize ){
1469 return SQLITE_CORRUPT_BKPT;
1470 }
1471 pPage->nFree = (u16)(nFree - iCellFirst);
1472 pPage->isInit = 1;
1473 }
1474 return SQLITE_OK;
1475 }
1476
1477 /*
1478 ** Set up a raw page so that it looks like a database page holding
1479 ** no entries.
1480 */
zeroPage(MemPage * pPage,int flags)1481 static void zeroPage(MemPage *pPage, int flags){
1482 unsigned char *data = pPage->aData;
1483 BtShared *pBt = pPage->pBt;
1484 u8 hdr = pPage->hdrOffset;
1485 u16 first;
1486
1487 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1488 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1489 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1490 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1491 assert( sqlite3_mutex_held(pBt->mutex) );
1492 if( pBt->secureDelete ){
1493 memset(&data[hdr], 0, pBt->usableSize - hdr);
1494 }
1495 data[hdr] = (char)flags;
1496 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
1497 memset(&data[hdr+1], 0, 4);
1498 data[hdr+7] = 0;
1499 put2byte(&data[hdr+5], pBt->usableSize);
1500 pPage->nFree = (u16)(pBt->usableSize - first);
1501 decodeFlags(pPage, flags);
1502 pPage->hdrOffset = hdr;
1503 pPage->cellOffset = first;
1504 pPage->nOverflow = 0;
1505 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
1506 pPage->maskPage = (u16)(pBt->pageSize - 1);
1507 pPage->nCell = 0;
1508 pPage->isInit = 1;
1509 }
1510
1511
1512 /*
1513 ** Convert a DbPage obtained from the pager into a MemPage used by
1514 ** the btree layer.
1515 */
btreePageFromDbPage(DbPage * pDbPage,Pgno pgno,BtShared * pBt)1516 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1517 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1518 pPage->aData = sqlite3PagerGetData(pDbPage);
1519 pPage->pDbPage = pDbPage;
1520 pPage->pBt = pBt;
1521 pPage->pgno = pgno;
1522 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1523 return pPage;
1524 }
1525
1526 /*
1527 ** Get a page from the pager. Initialize the MemPage.pBt and
1528 ** MemPage.aData elements if needed.
1529 **
1530 ** If the noContent flag is set, it means that we do not care about
1531 ** the content of the page at this time. So do not go to the disk
1532 ** to fetch the content. Just fill in the content with zeros for now.
1533 ** If in the future we call sqlite3PagerWrite() on this page, that
1534 ** means we have started to be concerned about content and the disk
1535 ** read should occur at that point.
1536 */
btreeGetPage(BtShared * pBt,Pgno pgno,MemPage ** ppPage,int noContent)1537 static int btreeGetPage(
1538 BtShared *pBt, /* The btree */
1539 Pgno pgno, /* Number of the page to fetch */
1540 MemPage **ppPage, /* Return the page in this parameter */
1541 int noContent /* Do not load page content if true */
1542 ){
1543 int rc;
1544 DbPage *pDbPage;
1545
1546 assert( sqlite3_mutex_held(pBt->mutex) );
1547 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1548 if( rc ) return rc;
1549 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1550 return SQLITE_OK;
1551 }
1552
1553 /*
1554 ** Retrieve a page from the pager cache. If the requested page is not
1555 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1556 ** MemPage.aData elements if needed.
1557 */
btreePageLookup(BtShared * pBt,Pgno pgno)1558 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1559 DbPage *pDbPage;
1560 assert( sqlite3_mutex_held(pBt->mutex) );
1561 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1562 if( pDbPage ){
1563 return btreePageFromDbPage(pDbPage, pgno, pBt);
1564 }
1565 return 0;
1566 }
1567
1568 /*
1569 ** Return the size of the database file in pages. If there is any kind of
1570 ** error, return ((unsigned int)-1).
1571 */
btreePagecount(BtShared * pBt)1572 static Pgno btreePagecount(BtShared *pBt){
1573 return pBt->nPage;
1574 }
sqlite3BtreeLastPage(Btree * p)1575 u32 sqlite3BtreeLastPage(Btree *p){
1576 assert( sqlite3BtreeHoldsMutex(p) );
1577 assert( ((p->pBt->nPage)&0x8000000)==0 );
1578 return (int)btreePagecount(p->pBt);
1579 }
1580
1581 /*
1582 ** Get a page from the pager and initialize it. This routine is just a
1583 ** convenience wrapper around separate calls to btreeGetPage() and
1584 ** btreeInitPage().
1585 **
1586 ** If an error occurs, then the value *ppPage is set to is undefined. It
1587 ** may remain unchanged, or it may be set to an invalid value.
1588 */
getAndInitPage(BtShared * pBt,Pgno pgno,MemPage ** ppPage)1589 static int getAndInitPage(
1590 BtShared *pBt, /* The database file */
1591 Pgno pgno, /* Number of the page to get */
1592 MemPage **ppPage /* Write the page pointer here */
1593 ){
1594 int rc;
1595 assert( sqlite3_mutex_held(pBt->mutex) );
1596
1597 if( pgno>btreePagecount(pBt) ){
1598 rc = SQLITE_CORRUPT_BKPT;
1599 }else{
1600 rc = btreeGetPage(pBt, pgno, ppPage, 0);
1601 if( rc==SQLITE_OK ){
1602 rc = btreeInitPage(*ppPage);
1603 if( rc!=SQLITE_OK ){
1604 releasePage(*ppPage);
1605 }
1606 }
1607 }
1608
1609 testcase( pgno==0 );
1610 assert( pgno!=0 || rc==SQLITE_CORRUPT );
1611 return rc;
1612 }
1613
1614 /*
1615 ** Release a MemPage. This should be called once for each prior
1616 ** call to btreeGetPage.
1617 */
releasePage(MemPage * pPage)1618 static void releasePage(MemPage *pPage){
1619 if( pPage ){
1620 assert( pPage->aData );
1621 assert( pPage->pBt );
1622 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1623 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1624 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1625 sqlite3PagerUnref(pPage->pDbPage);
1626 }
1627 }
1628
1629 /*
1630 ** During a rollback, when the pager reloads information into the cache
1631 ** so that the cache is restored to its original state at the start of
1632 ** the transaction, for each page restored this routine is called.
1633 **
1634 ** This routine needs to reset the extra data section at the end of the
1635 ** page to agree with the restored data.
1636 */
pageReinit(DbPage * pData)1637 static void pageReinit(DbPage *pData){
1638 MemPage *pPage;
1639 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1640 assert( sqlite3PagerPageRefcount(pData)>0 );
1641 if( pPage->isInit ){
1642 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1643 pPage->isInit = 0;
1644 if( sqlite3PagerPageRefcount(pData)>1 ){
1645 /* pPage might not be a btree page; it might be an overflow page
1646 ** or ptrmap page or a free page. In those cases, the following
1647 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
1648 ** But no harm is done by this. And it is very important that
1649 ** btreeInitPage() be called on every btree page so we make
1650 ** the call for every page that comes in for re-initing. */
1651 btreeInitPage(pPage);
1652 }
1653 }
1654 }
1655
1656 /*
1657 ** Invoke the busy handler for a btree.
1658 */
btreeInvokeBusyHandler(void * pArg)1659 static int btreeInvokeBusyHandler(void *pArg){
1660 BtShared *pBt = (BtShared*)pArg;
1661 assert( pBt->db );
1662 assert( sqlite3_mutex_held(pBt->db->mutex) );
1663 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1664 }
1665
1666 /*
1667 ** Open a database file.
1668 **
1669 ** zFilename is the name of the database file. If zFilename is NULL
1670 ** then an ephemeral database is created. The ephemeral database might
1671 ** be exclusively in memory, or it might use a disk-based memory cache.
1672 ** Either way, the ephemeral database will be automatically deleted
1673 ** when sqlite3BtreeClose() is called.
1674 **
1675 ** If zFilename is ":memory:" then an in-memory database is created
1676 ** that is automatically destroyed when it is closed.
1677 **
1678 ** The "flags" parameter is a bitmask that might contain bits
1679 ** BTREE_OMIT_JOURNAL and/or BTREE_NO_READLOCK. The BTREE_NO_READLOCK
1680 ** bit is also set if the SQLITE_NoReadlock flags is set in db->flags.
1681 ** These flags are passed through into sqlite3PagerOpen() and must
1682 ** be the same values as PAGER_OMIT_JOURNAL and PAGER_NO_READLOCK.
1683 **
1684 ** If the database is already opened in the same database connection
1685 ** and we are in shared cache mode, then the open will fail with an
1686 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared
1687 ** objects in the same database connection since doing so will lead
1688 ** to problems with locking.
1689 */
sqlite3BtreeOpen(const char * zFilename,sqlite3 * db,Btree ** ppBtree,int flags,int vfsFlags)1690 int sqlite3BtreeOpen(
1691 const char *zFilename, /* Name of the file containing the BTree database */
1692 sqlite3 *db, /* Associated database handle */
1693 Btree **ppBtree, /* Pointer to new Btree object written here */
1694 int flags, /* Options */
1695 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1696 ){
1697 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1698 BtShared *pBt = 0; /* Shared part of btree structure */
1699 Btree *p; /* Handle to return */
1700 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */
1701 int rc = SQLITE_OK; /* Result code from this function */
1702 u8 nReserve; /* Byte of unused space on each page */
1703 unsigned char zDbHeader[100]; /* Database header content */
1704
1705 /* True if opening an ephemeral, temporary database */
1706 const int isTempDb = zFilename==0 || zFilename[0]==0;
1707
1708 /* Set the variable isMemdb to true for an in-memory database, or
1709 ** false for a file-based database.
1710 */
1711 #ifdef SQLITE_OMIT_MEMORYDB
1712 const int isMemdb = 0;
1713 #else
1714 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
1715 || (isTempDb && sqlite3TempInMemory(db));
1716 #endif
1717
1718 assert( db!=0 );
1719 assert( sqlite3_mutex_held(db->mutex) );
1720 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */
1721
1722 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
1723 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
1724
1725 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
1726 assert( (flags & BTREE_SINGLE)==0 || isTempDb );
1727
1728 if( db->flags & SQLITE_NoReadlock ){
1729 flags |= BTREE_NO_READLOCK;
1730 }
1731 if( isMemdb ){
1732 flags |= BTREE_MEMORY;
1733 }
1734 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
1735 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
1736 }
1737 pVfs = db->pVfs;
1738 p = sqlite3MallocZero(sizeof(Btree));
1739 if( !p ){
1740 return SQLITE_NOMEM;
1741 }
1742 p->inTrans = TRANS_NONE;
1743 p->db = db;
1744 #ifndef SQLITE_OMIT_SHARED_CACHE
1745 p->lock.pBtree = p;
1746 p->lock.iTable = 1;
1747 #endif
1748
1749 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1750 /*
1751 ** If this Btree is a candidate for shared cache, try to find an
1752 ** existing BtShared object that we can share with
1753 */
1754 if( isMemdb==0 && isTempDb==0 ){
1755 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
1756 int nFullPathname = pVfs->mxPathname+1;
1757 char *zFullPathname = sqlite3Malloc(nFullPathname);
1758 sqlite3_mutex *mutexShared;
1759 p->sharable = 1;
1760 if( !zFullPathname ){
1761 sqlite3_free(p);
1762 return SQLITE_NOMEM;
1763 }
1764 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1765 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1766 sqlite3_mutex_enter(mutexOpen);
1767 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1768 sqlite3_mutex_enter(mutexShared);
1769 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1770 assert( pBt->nRef>0 );
1771 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1772 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1773 int iDb;
1774 for(iDb=db->nDb-1; iDb>=0; iDb--){
1775 Btree *pExisting = db->aDb[iDb].pBt;
1776 if( pExisting && pExisting->pBt==pBt ){
1777 sqlite3_mutex_leave(mutexShared);
1778 sqlite3_mutex_leave(mutexOpen);
1779 sqlite3_free(zFullPathname);
1780 sqlite3_free(p);
1781 return SQLITE_CONSTRAINT;
1782 }
1783 }
1784 p->pBt = pBt;
1785 pBt->nRef++;
1786 break;
1787 }
1788 }
1789 sqlite3_mutex_leave(mutexShared);
1790 sqlite3_free(zFullPathname);
1791 }
1792 #ifdef SQLITE_DEBUG
1793 else{
1794 /* In debug mode, we mark all persistent databases as sharable
1795 ** even when they are not. This exercises the locking code and
1796 ** gives more opportunity for asserts(sqlite3_mutex_held())
1797 ** statements to find locking problems.
1798 */
1799 p->sharable = 1;
1800 }
1801 #endif
1802 }
1803 #endif
1804 if( pBt==0 ){
1805 /*
1806 ** The following asserts make sure that structures used by the btree are
1807 ** the right size. This is to guard against size changes that result
1808 ** when compiling on a different architecture.
1809 */
1810 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1811 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1812 assert( sizeof(u32)==4 );
1813 assert( sizeof(u16)==2 );
1814 assert( sizeof(Pgno)==4 );
1815
1816 pBt = sqlite3MallocZero( sizeof(*pBt) );
1817 if( pBt==0 ){
1818 rc = SQLITE_NOMEM;
1819 goto btree_open_out;
1820 }
1821 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1822 EXTRA_SIZE, flags, vfsFlags, pageReinit);
1823 if( rc==SQLITE_OK ){
1824 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1825 }
1826 if( rc!=SQLITE_OK ){
1827 goto btree_open_out;
1828 }
1829 pBt->openFlags = (u8)flags;
1830 pBt->db = db;
1831 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1832 p->pBt = pBt;
1833
1834 pBt->pCursor = 0;
1835 pBt->pPage1 = 0;
1836 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1837 #ifdef SQLITE_SECURE_DELETE
1838 pBt->secureDelete = 1;
1839 #endif
1840 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
1841 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1842 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1843 pBt->pageSize = 0;
1844 #ifndef SQLITE_OMIT_AUTOVACUUM
1845 /* If the magic name ":memory:" will create an in-memory database, then
1846 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1847 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1848 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1849 ** regular file-name. In this case the auto-vacuum applies as per normal.
1850 */
1851 if( zFilename && !isMemdb ){
1852 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1853 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1854 }
1855 #endif
1856 nReserve = 0;
1857 }else{
1858 nReserve = zDbHeader[20];
1859 pBt->pageSizeFixed = 1;
1860 #ifndef SQLITE_OMIT_AUTOVACUUM
1861 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1862 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1863 #endif
1864 }
1865 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
1866 if( rc ) goto btree_open_out;
1867 pBt->usableSize = pBt->pageSize - nReserve;
1868 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1869
1870 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1871 /* Add the new BtShared object to the linked list sharable BtShareds.
1872 */
1873 if( p->sharable ){
1874 sqlite3_mutex *mutexShared;
1875 pBt->nRef = 1;
1876 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1877 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1878 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1879 if( pBt->mutex==0 ){
1880 rc = SQLITE_NOMEM;
1881 db->mallocFailed = 0;
1882 goto btree_open_out;
1883 }
1884 }
1885 sqlite3_mutex_enter(mutexShared);
1886 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1887 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1888 sqlite3_mutex_leave(mutexShared);
1889 }
1890 #endif
1891 }
1892
1893 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1894 /* If the new Btree uses a sharable pBtShared, then link the new
1895 ** Btree into the list of all sharable Btrees for the same connection.
1896 ** The list is kept in ascending order by pBt address.
1897 */
1898 if( p->sharable ){
1899 int i;
1900 Btree *pSib;
1901 for(i=0; i<db->nDb; i++){
1902 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1903 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1904 if( p->pBt<pSib->pBt ){
1905 p->pNext = pSib;
1906 p->pPrev = 0;
1907 pSib->pPrev = p;
1908 }else{
1909 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1910 pSib = pSib->pNext;
1911 }
1912 p->pNext = pSib->pNext;
1913 p->pPrev = pSib;
1914 if( p->pNext ){
1915 p->pNext->pPrev = p;
1916 }
1917 pSib->pNext = p;
1918 }
1919 break;
1920 }
1921 }
1922 }
1923 #endif
1924 *ppBtree = p;
1925
1926 btree_open_out:
1927 if( rc!=SQLITE_OK ){
1928 if( pBt && pBt->pPager ){
1929 sqlite3PagerClose(pBt->pPager);
1930 }
1931 sqlite3_free(pBt);
1932 sqlite3_free(p);
1933 *ppBtree = 0;
1934 }else{
1935 /* If the B-Tree was successfully opened, set the pager-cache size to the
1936 ** default value. Except, when opening on an existing shared pager-cache,
1937 ** do not change the pager-cache size.
1938 */
1939 if( sqlite3BtreeSchema(p, 0, 0)==0 ){
1940 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
1941 }
1942 }
1943 if( mutexOpen ){
1944 assert( sqlite3_mutex_held(mutexOpen) );
1945 sqlite3_mutex_leave(mutexOpen);
1946 }
1947 return rc;
1948 }
1949
1950 /*
1951 ** Decrement the BtShared.nRef counter. When it reaches zero,
1952 ** remove the BtShared structure from the sharing list. Return
1953 ** true if the BtShared.nRef counter reaches zero and return
1954 ** false if it is still positive.
1955 */
removeFromSharingList(BtShared * pBt)1956 static int removeFromSharingList(BtShared *pBt){
1957 #ifndef SQLITE_OMIT_SHARED_CACHE
1958 sqlite3_mutex *pMaster;
1959 BtShared *pList;
1960 int removed = 0;
1961
1962 assert( sqlite3_mutex_notheld(pBt->mutex) );
1963 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1964 sqlite3_mutex_enter(pMaster);
1965 pBt->nRef--;
1966 if( pBt->nRef<=0 ){
1967 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1968 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1969 }else{
1970 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1971 while( ALWAYS(pList) && pList->pNext!=pBt ){
1972 pList=pList->pNext;
1973 }
1974 if( ALWAYS(pList) ){
1975 pList->pNext = pBt->pNext;
1976 }
1977 }
1978 if( SQLITE_THREADSAFE ){
1979 sqlite3_mutex_free(pBt->mutex);
1980 }
1981 removed = 1;
1982 }
1983 sqlite3_mutex_leave(pMaster);
1984 return removed;
1985 #else
1986 return 1;
1987 #endif
1988 }
1989
1990 /*
1991 ** Make sure pBt->pTmpSpace points to an allocation of
1992 ** MX_CELL_SIZE(pBt) bytes.
1993 */
allocateTempSpace(BtShared * pBt)1994 static void allocateTempSpace(BtShared *pBt){
1995 if( !pBt->pTmpSpace ){
1996 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1997 }
1998 }
1999
2000 /*
2001 ** Free the pBt->pTmpSpace allocation
2002 */
freeTempSpace(BtShared * pBt)2003 static void freeTempSpace(BtShared *pBt){
2004 sqlite3PageFree( pBt->pTmpSpace);
2005 pBt->pTmpSpace = 0;
2006 }
2007
2008 /*
2009 ** Close an open database and invalidate all cursors.
2010 */
sqlite3BtreeClose(Btree * p)2011 int sqlite3BtreeClose(Btree *p){
2012 BtShared *pBt = p->pBt;
2013 BtCursor *pCur;
2014
2015 /* Close all cursors opened via this handle. */
2016 assert( sqlite3_mutex_held(p->db->mutex) );
2017 sqlite3BtreeEnter(p);
2018 pCur = pBt->pCursor;
2019 while( pCur ){
2020 BtCursor *pTmp = pCur;
2021 pCur = pCur->pNext;
2022 if( pTmp->pBtree==p ){
2023 sqlite3BtreeCloseCursor(pTmp);
2024 }
2025 }
2026
2027 /* Rollback any active transaction and free the handle structure.
2028 ** The call to sqlite3BtreeRollback() drops any table-locks held by
2029 ** this handle.
2030 */
2031 sqlite3BtreeRollback(p);
2032 sqlite3BtreeLeave(p);
2033
2034 /* If there are still other outstanding references to the shared-btree
2035 ** structure, return now. The remainder of this procedure cleans
2036 ** up the shared-btree.
2037 */
2038 assert( p->wantToLock==0 && p->locked==0 );
2039 if( !p->sharable || removeFromSharingList(pBt) ){
2040 /* The pBt is no longer on the sharing list, so we can access
2041 ** it without having to hold the mutex.
2042 **
2043 ** Clean out and delete the BtShared object.
2044 */
2045 assert( !pBt->pCursor );
2046 sqlite3PagerClose(pBt->pPager);
2047 if( pBt->xFreeSchema && pBt->pSchema ){
2048 pBt->xFreeSchema(pBt->pSchema);
2049 }
2050 sqlite3DbFree(0, pBt->pSchema);
2051 freeTempSpace(pBt);
2052 sqlite3_free(pBt);
2053 }
2054
2055 #ifndef SQLITE_OMIT_SHARED_CACHE
2056 assert( p->wantToLock==0 );
2057 assert( p->locked==0 );
2058 if( p->pPrev ) p->pPrev->pNext = p->pNext;
2059 if( p->pNext ) p->pNext->pPrev = p->pPrev;
2060 #endif
2061
2062 sqlite3_free(p);
2063 return SQLITE_OK;
2064 }
2065
2066 /*
2067 ** Change the limit on the number of pages allowed in the cache.
2068 **
2069 ** The maximum number of cache pages is set to the absolute
2070 ** value of mxPage. If mxPage is negative, the pager will
2071 ** operate asynchronously - it will not stop to do fsync()s
2072 ** to insure data is written to the disk surface before
2073 ** continuing. Transactions still work if synchronous is off,
2074 ** and the database cannot be corrupted if this program
2075 ** crashes. But if the operating system crashes or there is
2076 ** an abrupt power failure when synchronous is off, the database
2077 ** could be left in an inconsistent and unrecoverable state.
2078 ** Synchronous is on by default so database corruption is not
2079 ** normally a worry.
2080 */
sqlite3BtreeSetCacheSize(Btree * p,int mxPage)2081 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2082 BtShared *pBt = p->pBt;
2083 assert( sqlite3_mutex_held(p->db->mutex) );
2084 sqlite3BtreeEnter(p);
2085 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2086 sqlite3BtreeLeave(p);
2087 return SQLITE_OK;
2088 }
2089
2090 /*
2091 ** Change the way data is synced to disk in order to increase or decrease
2092 ** how well the database resists damage due to OS crashes and power
2093 ** failures. Level 1 is the same as asynchronous (no syncs() occur and
2094 ** there is a high probability of damage) Level 2 is the default. There
2095 ** is a very low but non-zero probability of damage. Level 3 reduces the
2096 ** probability of damage to near zero but with a write performance reduction.
2097 */
2098 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
sqlite3BtreeSetSafetyLevel(Btree * p,int level,int fullSync,int ckptFullSync)2099 int sqlite3BtreeSetSafetyLevel(
2100 Btree *p, /* The btree to set the safety level on */
2101 int level, /* PRAGMA synchronous. 1=OFF, 2=NORMAL, 3=FULL */
2102 int fullSync, /* PRAGMA fullfsync. */
2103 int ckptFullSync /* PRAGMA checkpoint_fullfync */
2104 ){
2105 BtShared *pBt = p->pBt;
2106 assert( sqlite3_mutex_held(p->db->mutex) );
2107 assert( level>=1 && level<=3 );
2108 sqlite3BtreeEnter(p);
2109 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync, ckptFullSync);
2110 sqlite3BtreeLeave(p);
2111 return SQLITE_OK;
2112 }
2113 #endif
2114
2115 /*
2116 ** Return TRUE if the given btree is set to safety level 1. In other
2117 ** words, return TRUE if no sync() occurs on the disk files.
2118 */
sqlite3BtreeSyncDisabled(Btree * p)2119 int sqlite3BtreeSyncDisabled(Btree *p){
2120 BtShared *pBt = p->pBt;
2121 int rc;
2122 assert( sqlite3_mutex_held(p->db->mutex) );
2123 sqlite3BtreeEnter(p);
2124 assert( pBt && pBt->pPager );
2125 rc = sqlite3PagerNosync(pBt->pPager);
2126 sqlite3BtreeLeave(p);
2127 return rc;
2128 }
2129
2130 /*
2131 ** Change the default pages size and the number of reserved bytes per page.
2132 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2133 ** without changing anything.
2134 **
2135 ** The page size must be a power of 2 between 512 and 65536. If the page
2136 ** size supplied does not meet this constraint then the page size is not
2137 ** changed.
2138 **
2139 ** Page sizes are constrained to be a power of two so that the region
2140 ** of the database file used for locking (beginning at PENDING_BYTE,
2141 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2142 ** at the beginning of a page.
2143 **
2144 ** If parameter nReserve is less than zero, then the number of reserved
2145 ** bytes per page is left unchanged.
2146 **
2147 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
2148 ** and autovacuum mode can no longer be changed.
2149 */
sqlite3BtreeSetPageSize(Btree * p,int pageSize,int nReserve,int iFix)2150 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2151 int rc = SQLITE_OK;
2152 BtShared *pBt = p->pBt;
2153 assert( nReserve>=-1 && nReserve<=255 );
2154 sqlite3BtreeEnter(p);
2155 if( pBt->pageSizeFixed ){
2156 sqlite3BtreeLeave(p);
2157 return SQLITE_READONLY;
2158 }
2159 if( nReserve<0 ){
2160 nReserve = pBt->pageSize - pBt->usableSize;
2161 }
2162 assert( nReserve>=0 && nReserve<=255 );
2163 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2164 ((pageSize-1)&pageSize)==0 ){
2165 assert( (pageSize & 7)==0 );
2166 assert( !pBt->pPage1 && !pBt->pCursor );
2167 pBt->pageSize = (u32)pageSize;
2168 freeTempSpace(pBt);
2169 }
2170 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2171 pBt->usableSize = pBt->pageSize - (u16)nReserve;
2172 if( iFix ) pBt->pageSizeFixed = 1;
2173 sqlite3BtreeLeave(p);
2174 return rc;
2175 }
2176
2177 /*
2178 ** Return the currently defined page size
2179 */
sqlite3BtreeGetPageSize(Btree * p)2180 int sqlite3BtreeGetPageSize(Btree *p){
2181 return p->pBt->pageSize;
2182 }
2183
2184 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
2185 /*
2186 ** Return the number of bytes of space at the end of every page that
2187 ** are intentually left unused. This is the "reserved" space that is
2188 ** sometimes used by extensions.
2189 */
sqlite3BtreeGetReserve(Btree * p)2190 int sqlite3BtreeGetReserve(Btree *p){
2191 int n;
2192 sqlite3BtreeEnter(p);
2193 n = p->pBt->pageSize - p->pBt->usableSize;
2194 sqlite3BtreeLeave(p);
2195 return n;
2196 }
2197
2198 /*
2199 ** Set the maximum page count for a database if mxPage is positive.
2200 ** No changes are made if mxPage is 0 or negative.
2201 ** Regardless of the value of mxPage, return the maximum page count.
2202 */
sqlite3BtreeMaxPageCount(Btree * p,int mxPage)2203 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2204 int n;
2205 sqlite3BtreeEnter(p);
2206 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2207 sqlite3BtreeLeave(p);
2208 return n;
2209 }
2210
2211 /*
2212 ** Set the secureDelete flag if newFlag is 0 or 1. If newFlag is -1,
2213 ** then make no changes. Always return the value of the secureDelete
2214 ** setting after the change.
2215 */
sqlite3BtreeSecureDelete(Btree * p,int newFlag)2216 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2217 int b;
2218 if( p==0 ) return 0;
2219 sqlite3BtreeEnter(p);
2220 if( newFlag>=0 ){
2221 p->pBt->secureDelete = (newFlag!=0) ? 1 : 0;
2222 }
2223 b = p->pBt->secureDelete;
2224 sqlite3BtreeLeave(p);
2225 return b;
2226 }
2227 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
2228
2229 /*
2230 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2231 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2232 ** is disabled. The default value for the auto-vacuum property is
2233 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2234 */
sqlite3BtreeSetAutoVacuum(Btree * p,int autoVacuum)2235 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2236 #ifdef SQLITE_OMIT_AUTOVACUUM
2237 return SQLITE_READONLY;
2238 #else
2239 BtShared *pBt = p->pBt;
2240 int rc = SQLITE_OK;
2241 u8 av = (u8)autoVacuum;
2242
2243 sqlite3BtreeEnter(p);
2244 if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){
2245 rc = SQLITE_READONLY;
2246 }else{
2247 pBt->autoVacuum = av ?1:0;
2248 pBt->incrVacuum = av==2 ?1:0;
2249 }
2250 sqlite3BtreeLeave(p);
2251 return rc;
2252 #endif
2253 }
2254
2255 /*
2256 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2257 ** enabled 1 is returned. Otherwise 0.
2258 */
sqlite3BtreeGetAutoVacuum(Btree * p)2259 int sqlite3BtreeGetAutoVacuum(Btree *p){
2260 #ifdef SQLITE_OMIT_AUTOVACUUM
2261 return BTREE_AUTOVACUUM_NONE;
2262 #else
2263 int rc;
2264 sqlite3BtreeEnter(p);
2265 rc = (
2266 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2267 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2268 BTREE_AUTOVACUUM_INCR
2269 );
2270 sqlite3BtreeLeave(p);
2271 return rc;
2272 #endif
2273 }
2274
2275
2276 /*
2277 ** Get a reference to pPage1 of the database file. This will
2278 ** also acquire a readlock on that file.
2279 **
2280 ** SQLITE_OK is returned on success. If the file is not a
2281 ** well-formed database file, then SQLITE_CORRUPT is returned.
2282 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
2283 ** is returned if we run out of memory.
2284 */
lockBtree(BtShared * pBt)2285 static int lockBtree(BtShared *pBt){
2286 int rc; /* Result code from subfunctions */
2287 MemPage *pPage1; /* Page 1 of the database file */
2288 int nPage; /* Number of pages in the database */
2289 int nPageFile = 0; /* Number of pages in the database file */
2290 int nPageHeader; /* Number of pages in the database according to hdr */
2291
2292 assert( sqlite3_mutex_held(pBt->mutex) );
2293 assert( pBt->pPage1==0 );
2294 rc = sqlite3PagerSharedLock(pBt->pPager);
2295 if( rc!=SQLITE_OK ) return rc;
2296 rc = btreeGetPage(pBt, 1, &pPage1, 0);
2297 if( rc!=SQLITE_OK ) return rc;
2298
2299 /* Do some checking to help insure the file we opened really is
2300 ** a valid database file.
2301 */
2302 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
2303 sqlite3PagerPagecount(pBt->pPager, &nPageFile);
2304 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
2305 nPage = nPageFile;
2306 }
2307 if( nPage>0 ){
2308 u32 pageSize;
2309 u32 usableSize;
2310 u8 *page1 = pPage1->aData;
2311 rc = SQLITE_NOTADB;
2312 if( memcmp(page1, zMagicHeader, 16)!=0 ){
2313 goto page1_init_failed;
2314 }
2315
2316 #ifdef SQLITE_OMIT_WAL
2317 if( page1[18]>1 ){
2318 pBt->readOnly = 1;
2319 }
2320 if( page1[19]>1 ){
2321 goto page1_init_failed;
2322 }
2323 #else
2324 if( page1[18]>2 ){
2325 pBt->readOnly = 1;
2326 }
2327 if( page1[19]>2 ){
2328 goto page1_init_failed;
2329 }
2330
2331 /* If the write version is set to 2, this database should be accessed
2332 ** in WAL mode. If the log is not already open, open it now. Then
2333 ** return SQLITE_OK and return without populating BtShared.pPage1.
2334 ** The caller detects this and calls this function again. This is
2335 ** required as the version of page 1 currently in the page1 buffer
2336 ** may not be the latest version - there may be a newer one in the log
2337 ** file.
2338 */
2339 if( page1[19]==2 && pBt->doNotUseWAL==0 ){
2340 int isOpen = 0;
2341 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
2342 if( rc!=SQLITE_OK ){
2343 goto page1_init_failed;
2344 }else if( isOpen==0 ){
2345 releasePage(pPage1);
2346 return SQLITE_OK;
2347 }
2348 rc = SQLITE_NOTADB;
2349 }
2350 #endif
2351
2352 /* The maximum embedded fraction must be exactly 25%. And the minimum
2353 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2354 ** The original design allowed these amounts to vary, but as of
2355 ** version 3.6.0, we require them to be fixed.
2356 */
2357 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2358 goto page1_init_failed;
2359 }
2360 pageSize = (page1[16]<<8) | (page1[17]<<16);
2361 if( ((pageSize-1)&pageSize)!=0
2362 || pageSize>SQLITE_MAX_PAGE_SIZE
2363 || pageSize<=256
2364 ){
2365 goto page1_init_failed;
2366 }
2367 assert( (pageSize & 7)==0 );
2368 usableSize = pageSize - page1[20];
2369 if( (u32)pageSize!=pBt->pageSize ){
2370 /* After reading the first page of the database assuming a page size
2371 ** of BtShared.pageSize, we have discovered that the page-size is
2372 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2373 ** zero and return SQLITE_OK. The caller will call this function
2374 ** again with the correct page-size.
2375 */
2376 releasePage(pPage1);
2377 pBt->usableSize = usableSize;
2378 pBt->pageSize = pageSize;
2379 freeTempSpace(pBt);
2380 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2381 pageSize-usableSize);
2382 return rc;
2383 }
2384 if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
2385 rc = SQLITE_CORRUPT_BKPT;
2386 goto page1_init_failed;
2387 }
2388 if( usableSize<480 ){
2389 goto page1_init_failed;
2390 }
2391 pBt->pageSize = pageSize;
2392 pBt->usableSize = usableSize;
2393 #ifndef SQLITE_OMIT_AUTOVACUUM
2394 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2395 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2396 #endif
2397 }
2398
2399 /* maxLocal is the maximum amount of payload to store locally for
2400 ** a cell. Make sure it is small enough so that at least minFanout
2401 ** cells can will fit on one page. We assume a 10-byte page header.
2402 ** Besides the payload, the cell must store:
2403 ** 2-byte pointer to the cell
2404 ** 4-byte child pointer
2405 ** 9-byte nKey value
2406 ** 4-byte nData value
2407 ** 4-byte overflow page pointer
2408 ** So a cell consists of a 2-byte pointer, a header which is as much as
2409 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2410 ** page pointer.
2411 */
2412 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
2413 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
2414 pBt->maxLeaf = (u16)(pBt->usableSize - 35);
2415 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
2416 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2417 pBt->pPage1 = pPage1;
2418 pBt->nPage = nPage;
2419 return SQLITE_OK;
2420
2421 page1_init_failed:
2422 releasePage(pPage1);
2423 pBt->pPage1 = 0;
2424 return rc;
2425 }
2426
2427 /*
2428 ** If there are no outstanding cursors and we are not in the middle
2429 ** of a transaction but there is a read lock on the database, then
2430 ** this routine unrefs the first page of the database file which
2431 ** has the effect of releasing the read lock.
2432 **
2433 ** If there is a transaction in progress, this routine is a no-op.
2434 */
unlockBtreeIfUnused(BtShared * pBt)2435 static void unlockBtreeIfUnused(BtShared *pBt){
2436 assert( sqlite3_mutex_held(pBt->mutex) );
2437 assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE );
2438 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2439 assert( pBt->pPage1->aData );
2440 assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2441 assert( pBt->pPage1->aData );
2442 releasePage(pBt->pPage1);
2443 pBt->pPage1 = 0;
2444 }
2445 }
2446
2447 /*
2448 ** If pBt points to an empty file then convert that empty file
2449 ** into a new empty database by initializing the first page of
2450 ** the database.
2451 */
newDatabase(BtShared * pBt)2452 static int newDatabase(BtShared *pBt){
2453 MemPage *pP1;
2454 unsigned char *data;
2455 int rc;
2456
2457 assert( sqlite3_mutex_held(pBt->mutex) );
2458 if( pBt->nPage>0 ){
2459 return SQLITE_OK;
2460 }
2461 pP1 = pBt->pPage1;
2462 assert( pP1!=0 );
2463 data = pP1->aData;
2464 rc = sqlite3PagerWrite(pP1->pDbPage);
2465 if( rc ) return rc;
2466 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2467 assert( sizeof(zMagicHeader)==16 );
2468 data[16] = (u8)((pBt->pageSize>>8)&0xff);
2469 data[17] = (u8)((pBt->pageSize>>16)&0xff);
2470 data[18] = 1;
2471 data[19] = 1;
2472 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2473 data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2474 data[21] = 64;
2475 data[22] = 32;
2476 data[23] = 32;
2477 memset(&data[24], 0, 100-24);
2478 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2479 pBt->pageSizeFixed = 1;
2480 #ifndef SQLITE_OMIT_AUTOVACUUM
2481 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2482 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2483 put4byte(&data[36 + 4*4], pBt->autoVacuum);
2484 put4byte(&data[36 + 7*4], pBt->incrVacuum);
2485 #endif
2486 pBt->nPage = 1;
2487 data[31] = 1;
2488 return SQLITE_OK;
2489 }
2490
2491 /*
2492 ** Attempt to start a new transaction. A write-transaction
2493 ** is started if the second argument is nonzero, otherwise a read-
2494 ** transaction. If the second argument is 2 or more and exclusive
2495 ** transaction is started, meaning that no other process is allowed
2496 ** to access the database. A preexisting transaction may not be
2497 ** upgraded to exclusive by calling this routine a second time - the
2498 ** exclusivity flag only works for a new transaction.
2499 **
2500 ** A write-transaction must be started before attempting any
2501 ** changes to the database. None of the following routines
2502 ** will work unless a transaction is started first:
2503 **
2504 ** sqlite3BtreeCreateTable()
2505 ** sqlite3BtreeCreateIndex()
2506 ** sqlite3BtreeClearTable()
2507 ** sqlite3BtreeDropTable()
2508 ** sqlite3BtreeInsert()
2509 ** sqlite3BtreeDelete()
2510 ** sqlite3BtreeUpdateMeta()
2511 **
2512 ** If an initial attempt to acquire the lock fails because of lock contention
2513 ** and the database was previously unlocked, then invoke the busy handler
2514 ** if there is one. But if there was previously a read-lock, do not
2515 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
2516 ** returned when there is already a read-lock in order to avoid a deadlock.
2517 **
2518 ** Suppose there are two processes A and B. A has a read lock and B has
2519 ** a reserved lock. B tries to promote to exclusive but is blocked because
2520 ** of A's read lock. A tries to promote to reserved but is blocked by B.
2521 ** One or the other of the two processes must give way or there can be
2522 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
2523 ** when A already has a read lock, we encourage A to give up and let B
2524 ** proceed.
2525 */
sqlite3BtreeBeginTrans(Btree * p,int wrflag)2526 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
2527 sqlite3 *pBlock = 0;
2528 BtShared *pBt = p->pBt;
2529 int rc = SQLITE_OK;
2530
2531 sqlite3BtreeEnter(p);
2532 btreeIntegrity(p);
2533
2534 /* If the btree is already in a write-transaction, or it
2535 ** is already in a read-transaction and a read-transaction
2536 ** is requested, this is a no-op.
2537 */
2538 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
2539 goto trans_begun;
2540 }
2541
2542 /* Write transactions are not possible on a read-only database */
2543 if( pBt->readOnly && wrflag ){
2544 rc = SQLITE_READONLY;
2545 goto trans_begun;
2546 }
2547
2548 #ifndef SQLITE_OMIT_SHARED_CACHE
2549 /* If another database handle has already opened a write transaction
2550 ** on this shared-btree structure and a second write transaction is
2551 ** requested, return SQLITE_LOCKED.
2552 */
2553 if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){
2554 pBlock = pBt->pWriter->db;
2555 }else if( wrflag>1 ){
2556 BtLock *pIter;
2557 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2558 if( pIter->pBtree!=p ){
2559 pBlock = pIter->pBtree->db;
2560 break;
2561 }
2562 }
2563 }
2564 if( pBlock ){
2565 sqlite3ConnectionBlocked(p->db, pBlock);
2566 rc = SQLITE_LOCKED_SHAREDCACHE;
2567 goto trans_begun;
2568 }
2569 #endif
2570
2571 /* Any read-only or read-write transaction implies a read-lock on
2572 ** page 1. So if some other shared-cache client already has a write-lock
2573 ** on page 1, the transaction cannot be opened. */
2574 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
2575 if( SQLITE_OK!=rc ) goto trans_begun;
2576
2577 pBt->initiallyEmpty = (u8)(pBt->nPage==0);
2578 do {
2579 /* Call lockBtree() until either pBt->pPage1 is populated or
2580 ** lockBtree() returns something other than SQLITE_OK. lockBtree()
2581 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
2582 ** reading page 1 it discovers that the page-size of the database
2583 ** file is not pBt->pageSize. In this case lockBtree() will update
2584 ** pBt->pageSize to the page-size of the file on disk.
2585 */
2586 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
2587
2588 if( rc==SQLITE_OK && wrflag ){
2589 if( pBt->readOnly ){
2590 rc = SQLITE_READONLY;
2591 }else{
2592 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
2593 if( rc==SQLITE_OK ){
2594 rc = newDatabase(pBt);
2595 }
2596 }
2597 }
2598
2599 if( rc!=SQLITE_OK ){
2600 unlockBtreeIfUnused(pBt);
2601 }
2602 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2603 btreeInvokeBusyHandler(pBt) );
2604
2605 if( rc==SQLITE_OK ){
2606 if( p->inTrans==TRANS_NONE ){
2607 pBt->nTransaction++;
2608 #ifndef SQLITE_OMIT_SHARED_CACHE
2609 if( p->sharable ){
2610 assert( p->lock.pBtree==p && p->lock.iTable==1 );
2611 p->lock.eLock = READ_LOCK;
2612 p->lock.pNext = pBt->pLock;
2613 pBt->pLock = &p->lock;
2614 }
2615 #endif
2616 }
2617 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2618 if( p->inTrans>pBt->inTransaction ){
2619 pBt->inTransaction = p->inTrans;
2620 }
2621 if( wrflag ){
2622 MemPage *pPage1 = pBt->pPage1;
2623 #ifndef SQLITE_OMIT_SHARED_CACHE
2624 assert( !pBt->pWriter );
2625 pBt->pWriter = p;
2626 pBt->isExclusive = (u8)(wrflag>1);
2627 #endif
2628
2629 /* If the db-size header field is incorrect (as it may be if an old
2630 ** client has been writing the database file), update it now. Doing
2631 ** this sooner rather than later means the database size can safely
2632 ** re-read the database size from page 1 if a savepoint or transaction
2633 ** rollback occurs within the transaction.
2634 */
2635 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
2636 rc = sqlite3PagerWrite(pPage1->pDbPage);
2637 if( rc==SQLITE_OK ){
2638 put4byte(&pPage1->aData[28], pBt->nPage);
2639 }
2640 }
2641 }
2642 }
2643
2644
2645 trans_begun:
2646 if( rc==SQLITE_OK && wrflag ){
2647 /* This call makes sure that the pager has the correct number of
2648 ** open savepoints. If the second parameter is greater than 0 and
2649 ** the sub-journal is not already open, then it will be opened here.
2650 */
2651 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2652 }
2653
2654 btreeIntegrity(p);
2655 sqlite3BtreeLeave(p);
2656 return rc;
2657 }
2658
2659 #ifndef SQLITE_OMIT_AUTOVACUUM
2660
2661 /*
2662 ** Set the pointer-map entries for all children of page pPage. Also, if
2663 ** pPage contains cells that point to overflow pages, set the pointer
2664 ** map entries for the overflow pages as well.
2665 */
setChildPtrmaps(MemPage * pPage)2666 static int setChildPtrmaps(MemPage *pPage){
2667 int i; /* Counter variable */
2668 int nCell; /* Number of cells in page pPage */
2669 int rc; /* Return code */
2670 BtShared *pBt = pPage->pBt;
2671 u8 isInitOrig = pPage->isInit;
2672 Pgno pgno = pPage->pgno;
2673
2674 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2675 rc = btreeInitPage(pPage);
2676 if( rc!=SQLITE_OK ){
2677 goto set_child_ptrmaps_out;
2678 }
2679 nCell = pPage->nCell;
2680
2681 for(i=0; i<nCell; i++){
2682 u8 *pCell = findCell(pPage, i);
2683
2684 ptrmapPutOvflPtr(pPage, pCell, &rc);
2685
2686 if( !pPage->leaf ){
2687 Pgno childPgno = get4byte(pCell);
2688 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2689 }
2690 }
2691
2692 if( !pPage->leaf ){
2693 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2694 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2695 }
2696
2697 set_child_ptrmaps_out:
2698 pPage->isInit = isInitOrig;
2699 return rc;
2700 }
2701
2702 /*
2703 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so
2704 ** that it points to iTo. Parameter eType describes the type of pointer to
2705 ** be modified, as follows:
2706 **
2707 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2708 ** page of pPage.
2709 **
2710 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2711 ** page pointed to by one of the cells on pPage.
2712 **
2713 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2714 ** overflow page in the list.
2715 */
modifyPagePointer(MemPage * pPage,Pgno iFrom,Pgno iTo,u8 eType)2716 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2717 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2718 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2719 if( eType==PTRMAP_OVERFLOW2 ){
2720 /* The pointer is always the first 4 bytes of the page in this case. */
2721 if( get4byte(pPage->aData)!=iFrom ){
2722 return SQLITE_CORRUPT_BKPT;
2723 }
2724 put4byte(pPage->aData, iTo);
2725 }else{
2726 u8 isInitOrig = pPage->isInit;
2727 int i;
2728 int nCell;
2729
2730 btreeInitPage(pPage);
2731 nCell = pPage->nCell;
2732
2733 for(i=0; i<nCell; i++){
2734 u8 *pCell = findCell(pPage, i);
2735 if( eType==PTRMAP_OVERFLOW1 ){
2736 CellInfo info;
2737 btreeParseCellPtr(pPage, pCell, &info);
2738 if( info.iOverflow ){
2739 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2740 put4byte(&pCell[info.iOverflow], iTo);
2741 break;
2742 }
2743 }
2744 }else{
2745 if( get4byte(pCell)==iFrom ){
2746 put4byte(pCell, iTo);
2747 break;
2748 }
2749 }
2750 }
2751
2752 if( i==nCell ){
2753 if( eType!=PTRMAP_BTREE ||
2754 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2755 return SQLITE_CORRUPT_BKPT;
2756 }
2757 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2758 }
2759
2760 pPage->isInit = isInitOrig;
2761 }
2762 return SQLITE_OK;
2763 }
2764
2765
2766 /*
2767 ** Move the open database page pDbPage to location iFreePage in the
2768 ** database. The pDbPage reference remains valid.
2769 **
2770 ** The isCommit flag indicates that there is no need to remember that
2771 ** the journal needs to be sync()ed before database page pDbPage->pgno
2772 ** can be written to. The caller has already promised not to write to that
2773 ** page.
2774 */
relocatePage(BtShared * pBt,MemPage * pDbPage,u8 eType,Pgno iPtrPage,Pgno iFreePage,int isCommit)2775 static int relocatePage(
2776 BtShared *pBt, /* Btree */
2777 MemPage *pDbPage, /* Open page to move */
2778 u8 eType, /* Pointer map 'type' entry for pDbPage */
2779 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2780 Pgno iFreePage, /* The location to move pDbPage to */
2781 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */
2782 ){
2783 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2784 Pgno iDbPage = pDbPage->pgno;
2785 Pager *pPager = pBt->pPager;
2786 int rc;
2787
2788 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2789 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2790 assert( sqlite3_mutex_held(pBt->mutex) );
2791 assert( pDbPage->pBt==pBt );
2792
2793 /* Move page iDbPage from its current location to page number iFreePage */
2794 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2795 iDbPage, iFreePage, iPtrPage, eType));
2796 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2797 if( rc!=SQLITE_OK ){
2798 return rc;
2799 }
2800 pDbPage->pgno = iFreePage;
2801
2802 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2803 ** that point to overflow pages. The pointer map entries for all these
2804 ** pages need to be changed.
2805 **
2806 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2807 ** pointer to a subsequent overflow page. If this is the case, then
2808 ** the pointer map needs to be updated for the subsequent overflow page.
2809 */
2810 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2811 rc = setChildPtrmaps(pDbPage);
2812 if( rc!=SQLITE_OK ){
2813 return rc;
2814 }
2815 }else{
2816 Pgno nextOvfl = get4byte(pDbPage->aData);
2817 if( nextOvfl!=0 ){
2818 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
2819 if( rc!=SQLITE_OK ){
2820 return rc;
2821 }
2822 }
2823 }
2824
2825 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2826 ** that it points at iFreePage. Also fix the pointer map entry for
2827 ** iPtrPage.
2828 */
2829 if( eType!=PTRMAP_ROOTPAGE ){
2830 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2831 if( rc!=SQLITE_OK ){
2832 return rc;
2833 }
2834 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2835 if( rc!=SQLITE_OK ){
2836 releasePage(pPtrPage);
2837 return rc;
2838 }
2839 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2840 releasePage(pPtrPage);
2841 if( rc==SQLITE_OK ){
2842 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
2843 }
2844 }
2845 return rc;
2846 }
2847
2848 /* Forward declaration required by incrVacuumStep(). */
2849 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2850
2851 /*
2852 ** Perform a single step of an incremental-vacuum. If successful,
2853 ** return SQLITE_OK. If there is no work to do (and therefore no
2854 ** point in calling this function again), return SQLITE_DONE.
2855 **
2856 ** More specificly, this function attempts to re-organize the
2857 ** database so that the last page of the file currently in use
2858 ** is no longer in use.
2859 **
2860 ** If the nFin parameter is non-zero, this function assumes
2861 ** that the caller will keep calling incrVacuumStep() until
2862 ** it returns SQLITE_DONE or an error, and that nFin is the
2863 ** number of pages the database file will contain after this
2864 ** process is complete. If nFin is zero, it is assumed that
2865 ** incrVacuumStep() will be called a finite amount of times
2866 ** which may or may not empty the freelist. A full autovacuum
2867 ** has nFin>0. A "PRAGMA incremental_vacuum" has nFin==0.
2868 */
incrVacuumStep(BtShared * pBt,Pgno nFin,Pgno iLastPg)2869 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
2870 Pgno nFreeList; /* Number of pages still on the free-list */
2871 int rc;
2872
2873 assert( sqlite3_mutex_held(pBt->mutex) );
2874 assert( iLastPg>nFin );
2875
2876 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2877 u8 eType;
2878 Pgno iPtrPage;
2879
2880 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2881 if( nFreeList==0 ){
2882 return SQLITE_DONE;
2883 }
2884
2885 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2886 if( rc!=SQLITE_OK ){
2887 return rc;
2888 }
2889 if( eType==PTRMAP_ROOTPAGE ){
2890 return SQLITE_CORRUPT_BKPT;
2891 }
2892
2893 if( eType==PTRMAP_FREEPAGE ){
2894 if( nFin==0 ){
2895 /* Remove the page from the files free-list. This is not required
2896 ** if nFin is non-zero. In that case, the free-list will be
2897 ** truncated to zero after this function returns, so it doesn't
2898 ** matter if it still contains some garbage entries.
2899 */
2900 Pgno iFreePg;
2901 MemPage *pFreePg;
2902 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2903 if( rc!=SQLITE_OK ){
2904 return rc;
2905 }
2906 assert( iFreePg==iLastPg );
2907 releasePage(pFreePg);
2908 }
2909 } else {
2910 Pgno iFreePg; /* Index of free page to move pLastPg to */
2911 MemPage *pLastPg;
2912
2913 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
2914 if( rc!=SQLITE_OK ){
2915 return rc;
2916 }
2917
2918 /* If nFin is zero, this loop runs exactly once and page pLastPg
2919 ** is swapped with the first free page pulled off the free list.
2920 **
2921 ** On the other hand, if nFin is greater than zero, then keep
2922 ** looping until a free-page located within the first nFin pages
2923 ** of the file is found.
2924 */
2925 do {
2926 MemPage *pFreePg;
2927 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2928 if( rc!=SQLITE_OK ){
2929 releasePage(pLastPg);
2930 return rc;
2931 }
2932 releasePage(pFreePg);
2933 }while( nFin!=0 && iFreePg>nFin );
2934 assert( iFreePg<iLastPg );
2935
2936 rc = sqlite3PagerWrite(pLastPg->pDbPage);
2937 if( rc==SQLITE_OK ){
2938 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2939 }
2940 releasePage(pLastPg);
2941 if( rc!=SQLITE_OK ){
2942 return rc;
2943 }
2944 }
2945 }
2946
2947 if( nFin==0 ){
2948 iLastPg--;
2949 while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
2950 if( PTRMAP_ISPAGE(pBt, iLastPg) ){
2951 MemPage *pPg;
2952 rc = btreeGetPage(pBt, iLastPg, &pPg, 0);
2953 if( rc!=SQLITE_OK ){
2954 return rc;
2955 }
2956 rc = sqlite3PagerWrite(pPg->pDbPage);
2957 releasePage(pPg);
2958 if( rc!=SQLITE_OK ){
2959 return rc;
2960 }
2961 }
2962 iLastPg--;
2963 }
2964 sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
2965 pBt->nPage = iLastPg;
2966 }
2967 return SQLITE_OK;
2968 }
2969
2970 /*
2971 ** A write-transaction must be opened before calling this function.
2972 ** It performs a single unit of work towards an incremental vacuum.
2973 **
2974 ** If the incremental vacuum is finished after this function has run,
2975 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
2976 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2977 */
sqlite3BtreeIncrVacuum(Btree * p)2978 int sqlite3BtreeIncrVacuum(Btree *p){
2979 int rc;
2980 BtShared *pBt = p->pBt;
2981
2982 sqlite3BtreeEnter(p);
2983 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2984 if( !pBt->autoVacuum ){
2985 rc = SQLITE_DONE;
2986 }else{
2987 invalidateAllOverflowCache(pBt);
2988 rc = incrVacuumStep(pBt, 0, btreePagecount(pBt));
2989 if( rc==SQLITE_OK ){
2990 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2991 put4byte(&pBt->pPage1->aData[28], pBt->nPage);
2992 }
2993 }
2994 sqlite3BtreeLeave(p);
2995 return rc;
2996 }
2997
2998 /*
2999 ** This routine is called prior to sqlite3PagerCommit when a transaction
3000 ** is commited for an auto-vacuum database.
3001 **
3002 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3003 ** the database file should be truncated to during the commit process.
3004 ** i.e. the database has been reorganized so that only the first *pnTrunc
3005 ** pages are in use.
3006 */
autoVacuumCommit(BtShared * pBt)3007 static int autoVacuumCommit(BtShared *pBt){
3008 int rc = SQLITE_OK;
3009 Pager *pPager = pBt->pPager;
3010 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
3011
3012 assert( sqlite3_mutex_held(pBt->mutex) );
3013 invalidateAllOverflowCache(pBt);
3014 assert(pBt->autoVacuum);
3015 if( !pBt->incrVacuum ){
3016 Pgno nFin; /* Number of pages in database after autovacuuming */
3017 Pgno nFree; /* Number of pages on the freelist initially */
3018 Pgno nPtrmap; /* Number of PtrMap pages to be freed */
3019 Pgno iFree; /* The next page to be freed */
3020 int nEntry; /* Number of entries on one ptrmap page */
3021 Pgno nOrig; /* Database size before freeing */
3022
3023 nOrig = btreePagecount(pBt);
3024 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3025 /* It is not possible to create a database for which the final page
3026 ** is either a pointer-map page or the pending-byte page. If one
3027 ** is encountered, this indicates corruption.
3028 */
3029 return SQLITE_CORRUPT_BKPT;
3030 }
3031
3032 nFree = get4byte(&pBt->pPage1->aData[36]);
3033 nEntry = pBt->usableSize/5;
3034 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3035 nFin = nOrig - nFree - nPtrmap;
3036 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3037 nFin--;
3038 }
3039 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3040 nFin--;
3041 }
3042 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3043
3044 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3045 rc = incrVacuumStep(pBt, nFin, iFree);
3046 }
3047 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3048 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3049 put4byte(&pBt->pPage1->aData[32], 0);
3050 put4byte(&pBt->pPage1->aData[36], 0);
3051 put4byte(&pBt->pPage1->aData[28], nFin);
3052 sqlite3PagerTruncateImage(pBt->pPager, nFin);
3053 pBt->nPage = nFin;
3054 }
3055 if( rc!=SQLITE_OK ){
3056 sqlite3PagerRollback(pPager);
3057 }
3058 }
3059
3060 assert( nRef==sqlite3PagerRefcount(pPager) );
3061 return rc;
3062 }
3063
3064 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
3065 # define setChildPtrmaps(x) SQLITE_OK
3066 #endif
3067
3068 /*
3069 ** This routine does the first phase of a two-phase commit. This routine
3070 ** causes a rollback journal to be created (if it does not already exist)
3071 ** and populated with enough information so that if a power loss occurs
3072 ** the database can be restored to its original state by playing back
3073 ** the journal. Then the contents of the journal are flushed out to
3074 ** the disk. After the journal is safely on oxide, the changes to the
3075 ** database are written into the database file and flushed to oxide.
3076 ** At the end of this call, the rollback journal still exists on the
3077 ** disk and we are still holding all locks, so the transaction has not
3078 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3079 ** commit process.
3080 **
3081 ** This call is a no-op if no write-transaction is currently active on pBt.
3082 **
3083 ** Otherwise, sync the database file for the btree pBt. zMaster points to
3084 ** the name of a master journal file that should be written into the
3085 ** individual journal file, or is NULL, indicating no master journal file
3086 ** (single database transaction).
3087 **
3088 ** When this is called, the master journal should already have been
3089 ** created, populated with this journal pointer and synced to disk.
3090 **
3091 ** Once this is routine has returned, the only thing required to commit
3092 ** the write-transaction for this database file is to delete the journal.
3093 */
sqlite3BtreeCommitPhaseOne(Btree * p,const char * zMaster)3094 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
3095 int rc = SQLITE_OK;
3096 if( p->inTrans==TRANS_WRITE ){
3097 BtShared *pBt = p->pBt;
3098 sqlite3BtreeEnter(p);
3099 #ifndef SQLITE_OMIT_AUTOVACUUM
3100 if( pBt->autoVacuum ){
3101 rc = autoVacuumCommit(pBt);
3102 if( rc!=SQLITE_OK ){
3103 sqlite3BtreeLeave(p);
3104 return rc;
3105 }
3106 }
3107 #endif
3108 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
3109 sqlite3BtreeLeave(p);
3110 }
3111 return rc;
3112 }
3113
3114 /*
3115 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
3116 ** at the conclusion of a transaction.
3117 */
btreeEndTransaction(Btree * p)3118 static void btreeEndTransaction(Btree *p){
3119 BtShared *pBt = p->pBt;
3120 assert( sqlite3BtreeHoldsMutex(p) );
3121
3122 btreeClearHasContent(pBt);
3123 if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){
3124 /* If there are other active statements that belong to this database
3125 ** handle, downgrade to a read-only transaction. The other statements
3126 ** may still be reading from the database. */
3127 downgradeAllSharedCacheTableLocks(p);
3128 p->inTrans = TRANS_READ;
3129 }else{
3130 /* If the handle had any kind of transaction open, decrement the
3131 ** transaction count of the shared btree. If the transaction count
3132 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
3133 ** call below will unlock the pager. */
3134 if( p->inTrans!=TRANS_NONE ){
3135 clearAllSharedCacheTableLocks(p);
3136 pBt->nTransaction--;
3137 if( 0==pBt->nTransaction ){
3138 pBt->inTransaction = TRANS_NONE;
3139 }
3140 }
3141
3142 /* Set the current transaction state to TRANS_NONE and unlock the
3143 ** pager if this call closed the only read or write transaction. */
3144 p->inTrans = TRANS_NONE;
3145 unlockBtreeIfUnused(pBt);
3146 }
3147
3148 btreeIntegrity(p);
3149 }
3150
3151 /*
3152 ** Commit the transaction currently in progress.
3153 **
3154 ** This routine implements the second phase of a 2-phase commit. The
3155 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3156 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()
3157 ** routine did all the work of writing information out to disk and flushing the
3158 ** contents so that they are written onto the disk platter. All this
3159 ** routine has to do is delete or truncate or zero the header in the
3160 ** the rollback journal (which causes the transaction to commit) and
3161 ** drop locks.
3162 **
3163 ** Normally, if an error occurs while the pager layer is attempting to
3164 ** finalize the underlying journal file, this function returns an error and
3165 ** the upper layer will attempt a rollback. However, if the second argument
3166 ** is non-zero then this b-tree transaction is part of a multi-file
3167 ** transaction. In this case, the transaction has already been committed
3168 ** (by deleting a master journal file) and the caller will ignore this
3169 ** functions return code. So, even if an error occurs in the pager layer,
3170 ** reset the b-tree objects internal state to indicate that the write
3171 ** transaction has been closed. This is quite safe, as the pager will have
3172 ** transitioned to the error state.
3173 **
3174 ** This will release the write lock on the database file. If there
3175 ** are no active cursors, it also releases the read lock.
3176 */
sqlite3BtreeCommitPhaseTwo(Btree * p,int bCleanup)3177 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
3178
3179 if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
3180 sqlite3BtreeEnter(p);
3181 btreeIntegrity(p);
3182
3183 /* If the handle has a write-transaction open, commit the shared-btrees
3184 ** transaction and set the shared state to TRANS_READ.
3185 */
3186 if( p->inTrans==TRANS_WRITE ){
3187 int rc;
3188 BtShared *pBt = p->pBt;
3189 assert( pBt->inTransaction==TRANS_WRITE );
3190 assert( pBt->nTransaction>0 );
3191 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3192 if( rc!=SQLITE_OK && bCleanup==0 ){
3193 sqlite3BtreeLeave(p);
3194 return rc;
3195 }
3196 pBt->inTransaction = TRANS_READ;
3197 }
3198
3199 btreeEndTransaction(p);
3200 sqlite3BtreeLeave(p);
3201 return SQLITE_OK;
3202 }
3203
3204 /*
3205 ** Do both phases of a commit.
3206 */
sqlite3BtreeCommit(Btree * p)3207 int sqlite3BtreeCommit(Btree *p){
3208 int rc;
3209 sqlite3BtreeEnter(p);
3210 rc = sqlite3BtreeCommitPhaseOne(p, 0);
3211 if( rc==SQLITE_OK ){
3212 rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3213 }
3214 sqlite3BtreeLeave(p);
3215 return rc;
3216 }
3217
3218 #ifndef NDEBUG
3219 /*
3220 ** Return the number of write-cursors open on this handle. This is for use
3221 ** in assert() expressions, so it is only compiled if NDEBUG is not
3222 ** defined.
3223 **
3224 ** For the purposes of this routine, a write-cursor is any cursor that
3225 ** is capable of writing to the databse. That means the cursor was
3226 ** originally opened for writing and the cursor has not be disabled
3227 ** by having its state changed to CURSOR_FAULT.
3228 */
countWriteCursors(BtShared * pBt)3229 static int countWriteCursors(BtShared *pBt){
3230 BtCursor *pCur;
3231 int r = 0;
3232 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3233 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
3234 }
3235 return r;
3236 }
3237 #endif
3238
3239 /*
3240 ** This routine sets the state to CURSOR_FAULT and the error
3241 ** code to errCode for every cursor on BtShared that pBtree
3242 ** references.
3243 **
3244 ** Every cursor is tripped, including cursors that belong
3245 ** to other database connections that happen to be sharing
3246 ** the cache with pBtree.
3247 **
3248 ** This routine gets called when a rollback occurs.
3249 ** All cursors using the same cache must be tripped
3250 ** to prevent them from trying to use the btree after
3251 ** the rollback. The rollback may have deleted tables
3252 ** or moved root pages, so it is not sufficient to
3253 ** save the state of the cursor. The cursor must be
3254 ** invalidated.
3255 */
sqlite3BtreeTripAllCursors(Btree * pBtree,int errCode)3256 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3257 BtCursor *p;
3258 sqlite3BtreeEnter(pBtree);
3259 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3260 int i;
3261 sqlite3BtreeClearCursor(p);
3262 p->eState = CURSOR_FAULT;
3263 p->skipNext = errCode;
3264 for(i=0; i<=p->iPage; i++){
3265 releasePage(p->apPage[i]);
3266 p->apPage[i] = 0;
3267 }
3268 }
3269 sqlite3BtreeLeave(pBtree);
3270 }
3271
3272 /*
3273 ** Rollback the transaction in progress. All cursors will be
3274 ** invalided by this operation. Any attempt to use a cursor
3275 ** that was open at the beginning of this operation will result
3276 ** in an error.
3277 **
3278 ** This will release the write lock on the database file. If there
3279 ** are no active cursors, it also releases the read lock.
3280 */
sqlite3BtreeRollback(Btree * p)3281 int sqlite3BtreeRollback(Btree *p){
3282 int rc;
3283 BtShared *pBt = p->pBt;
3284 MemPage *pPage1;
3285
3286 sqlite3BtreeEnter(p);
3287 rc = saveAllCursors(pBt, 0, 0);
3288 #ifndef SQLITE_OMIT_SHARED_CACHE
3289 if( rc!=SQLITE_OK ){
3290 /* This is a horrible situation. An IO or malloc() error occurred whilst
3291 ** trying to save cursor positions. If this is an automatic rollback (as
3292 ** the result of a constraint, malloc() failure or IO error) then
3293 ** the cache may be internally inconsistent (not contain valid trees) so
3294 ** we cannot simply return the error to the caller. Instead, abort
3295 ** all queries that may be using any of the cursors that failed to save.
3296 */
3297 sqlite3BtreeTripAllCursors(p, rc);
3298 }
3299 #endif
3300 btreeIntegrity(p);
3301
3302 if( p->inTrans==TRANS_WRITE ){
3303 int rc2;
3304
3305 assert( TRANS_WRITE==pBt->inTransaction );
3306 rc2 = sqlite3PagerRollback(pBt->pPager);
3307 if( rc2!=SQLITE_OK ){
3308 rc = rc2;
3309 }
3310
3311 /* The rollback may have destroyed the pPage1->aData value. So
3312 ** call btreeGetPage() on page 1 again to make
3313 ** sure pPage1->aData is set correctly. */
3314 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3315 int nPage = get4byte(28+(u8*)pPage1->aData);
3316 testcase( nPage==0 );
3317 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
3318 testcase( pBt->nPage!=nPage );
3319 pBt->nPage = nPage;
3320 releasePage(pPage1);
3321 }
3322 assert( countWriteCursors(pBt)==0 );
3323 pBt->inTransaction = TRANS_READ;
3324 }
3325
3326 btreeEndTransaction(p);
3327 sqlite3BtreeLeave(p);
3328 return rc;
3329 }
3330
3331 /*
3332 ** Start a statement subtransaction. The subtransaction can can be rolled
3333 ** back independently of the main transaction. You must start a transaction
3334 ** before starting a subtransaction. The subtransaction is ended automatically
3335 ** if the main transaction commits or rolls back.
3336 **
3337 ** Statement subtransactions are used around individual SQL statements
3338 ** that are contained within a BEGIN...COMMIT block. If a constraint
3339 ** error occurs within the statement, the effect of that one statement
3340 ** can be rolled back without having to rollback the entire transaction.
3341 **
3342 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3343 ** value passed as the second parameter is the total number of savepoints,
3344 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3345 ** are no active savepoints and no other statement-transactions open,
3346 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3347 ** using the sqlite3BtreeSavepoint() function.
3348 */
sqlite3BtreeBeginStmt(Btree * p,int iStatement)3349 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3350 int rc;
3351 BtShared *pBt = p->pBt;
3352 sqlite3BtreeEnter(p);
3353 assert( p->inTrans==TRANS_WRITE );
3354 assert( pBt->readOnly==0 );
3355 assert( iStatement>0 );
3356 assert( iStatement>p->db->nSavepoint );
3357 assert( pBt->inTransaction==TRANS_WRITE );
3358 /* At the pager level, a statement transaction is a savepoint with
3359 ** an index greater than all savepoints created explicitly using
3360 ** SQL statements. It is illegal to open, release or rollback any
3361 ** such savepoints while the statement transaction savepoint is active.
3362 */
3363 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3364 sqlite3BtreeLeave(p);
3365 return rc;
3366 }
3367
3368 /*
3369 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3370 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3371 ** savepoint identified by parameter iSavepoint, depending on the value
3372 ** of op.
3373 **
3374 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3375 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3376 ** contents of the entire transaction are rolled back. This is different
3377 ** from a normal transaction rollback, as no locks are released and the
3378 ** transaction remains open.
3379 */
sqlite3BtreeSavepoint(Btree * p,int op,int iSavepoint)3380 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3381 int rc = SQLITE_OK;
3382 if( p && p->inTrans==TRANS_WRITE ){
3383 BtShared *pBt = p->pBt;
3384 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3385 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3386 sqlite3BtreeEnter(p);
3387 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3388 if( rc==SQLITE_OK ){
3389 if( iSavepoint<0 && pBt->initiallyEmpty ) pBt->nPage = 0;
3390 rc = newDatabase(pBt);
3391 pBt->nPage = get4byte(28 + pBt->pPage1->aData);
3392
3393 /* The database size was written into the offset 28 of the header
3394 ** when the transaction started, so we know that the value at offset
3395 ** 28 is nonzero. */
3396 assert( pBt->nPage>0 );
3397 }
3398 sqlite3BtreeLeave(p);
3399 }
3400 return rc;
3401 }
3402
3403 /*
3404 ** Create a new cursor for the BTree whose root is on the page
3405 ** iTable. If a read-only cursor is requested, it is assumed that
3406 ** the caller already has at least a read-only transaction open
3407 ** on the database already. If a write-cursor is requested, then
3408 ** the caller is assumed to have an open write transaction.
3409 **
3410 ** If wrFlag==0, then the cursor can only be used for reading.
3411 ** If wrFlag==1, then the cursor can be used for reading or for
3412 ** writing if other conditions for writing are also met. These
3413 ** are the conditions that must be met in order for writing to
3414 ** be allowed:
3415 **
3416 ** 1: The cursor must have been opened with wrFlag==1
3417 **
3418 ** 2: Other database connections that share the same pager cache
3419 ** but which are not in the READ_UNCOMMITTED state may not have
3420 ** cursors open with wrFlag==0 on the same table. Otherwise
3421 ** the changes made by this write cursor would be visible to
3422 ** the read cursors in the other database connection.
3423 **
3424 ** 3: The database must be writable (not on read-only media)
3425 **
3426 ** 4: There must be an active transaction.
3427 **
3428 ** No checking is done to make sure that page iTable really is the
3429 ** root page of a b-tree. If it is not, then the cursor acquired
3430 ** will not work correctly.
3431 **
3432 ** It is assumed that the sqlite3BtreeCursorZero() has been called
3433 ** on pCur to initialize the memory space prior to invoking this routine.
3434 */
btreeCursor(Btree * p,int iTable,int wrFlag,struct KeyInfo * pKeyInfo,BtCursor * pCur)3435 static int btreeCursor(
3436 Btree *p, /* The btree */
3437 int iTable, /* Root page of table to open */
3438 int wrFlag, /* 1 to write. 0 read-only */
3439 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
3440 BtCursor *pCur /* Space for new cursor */
3441 ){
3442 BtShared *pBt = p->pBt; /* Shared b-tree handle */
3443
3444 assert( sqlite3BtreeHoldsMutex(p) );
3445 assert( wrFlag==0 || wrFlag==1 );
3446
3447 /* The following assert statements verify that if this is a sharable
3448 ** b-tree database, the connection is holding the required table locks,
3449 ** and that no other connection has any open cursor that conflicts with
3450 ** this lock. */
3451 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
3452 assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3453
3454 /* Assert that the caller has opened the required transaction. */
3455 assert( p->inTrans>TRANS_NONE );
3456 assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3457 assert( pBt->pPage1 && pBt->pPage1->aData );
3458
3459 if( NEVER(wrFlag && pBt->readOnly) ){
3460 return SQLITE_READONLY;
3461 }
3462 if( iTable==1 && btreePagecount(pBt)==0 ){
3463 return SQLITE_EMPTY;
3464 }
3465
3466 /* Now that no other errors can occur, finish filling in the BtCursor
3467 ** variables and link the cursor into the BtShared list. */
3468 pCur->pgnoRoot = (Pgno)iTable;
3469 pCur->iPage = -1;
3470 pCur->pKeyInfo = pKeyInfo;
3471 pCur->pBtree = p;
3472 pCur->pBt = pBt;
3473 pCur->wrFlag = (u8)wrFlag;
3474 pCur->pNext = pBt->pCursor;
3475 if( pCur->pNext ){
3476 pCur->pNext->pPrev = pCur;
3477 }
3478 pBt->pCursor = pCur;
3479 pCur->eState = CURSOR_INVALID;
3480 pCur->cachedRowid = 0;
3481 return SQLITE_OK;
3482 }
/*
** Public wrapper around btreeCursor(): performs the same cursor creation
** while holding the b-tree mutex, and returns btreeCursor()'s result.
*/
int sqlite3BtreeCursor(
  Btree *p,                   /* The btree */
  int iTable,                 /* Root page of table to open */
  int wrFlag,                 /* 1 to write. 0 read-only */
  struct KeyInfo *pKeyInfo,   /* First arg to xCompare() */
  BtCursor *pCur              /* Write new cursor here */
){
  int rcCursor;
  sqlite3BtreeEnter(p);
  rcCursor = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  sqlite3BtreeLeave(p);
  return rcCursor;
}
3496
3497 /*
3498 ** Return the size of a BtCursor object in bytes.
3499 **
3500 ** This interfaces is needed so that users of cursors can preallocate
3501 ** sufficient storage to hold a cursor. The BtCursor object is opaque
3502 ** to users so they cannot do the sizeof() themselves - they must call
3503 ** this routine.
3504 */
sqlite3BtreeCursorSize(void)3505 int sqlite3BtreeCursorSize(void){
3506 return ROUND8(sizeof(BtCursor));
3507 }
3508
3509 /*
3510 ** Initialize memory that will be converted into a BtCursor object.
3511 **
3512 ** The simple approach here would be to memset() the entire object
3513 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays
3514 ** do not need to be zeroed and they are large, so we can save a lot
3515 ** of run-time by skipping the initialization of those elements.
3516 */
sqlite3BtreeCursorZero(BtCursor * p)3517 void sqlite3BtreeCursorZero(BtCursor *p){
3518 memset(p, 0, offsetof(BtCursor, iPage));
3519 }
3520
3521 /*
3522 ** Set the cached rowid value of every cursor in the same database file
3523 ** as pCur and having the same root page number as pCur. The value is
3524 ** set to iRowid.
3525 **
3526 ** Only positive rowid values are considered valid for this cache.
3527 ** The cache is initialized to zero, indicating an invalid cache.
3528 ** A btree will work fine with zero or negative rowids. We just cannot
3529 ** cache zero or negative rowids, which means tables that use zero or
3530 ** negative rowids might run a little slower. But in practice, zero
3531 ** or negative rowids are very uncommon so this should not be a problem.
3532 */
sqlite3BtreeSetCachedRowid(BtCursor * pCur,sqlite3_int64 iRowid)3533 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3534 BtCursor *p;
3535 for(p=pCur->pBt->pCursor; p; p=p->pNext){
3536 if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3537 }
3538 assert( pCur->cachedRowid==iRowid );
3539 }
3540
3541 /*
3542 ** Return the cached rowid for the given cursor. A negative or zero
3543 ** return value indicates that the rowid cache is invalid and should be
3544 ** ignored. If the rowid cache has never before been set, then a
3545 ** zero is returned.
3546 */
sqlite3BtreeGetCachedRowid(BtCursor * pCur)3547 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3548 return pCur->cachedRowid;
3549 }
3550
3551 /*
3552 ** Close a cursor. The read lock on the database file is released
3553 ** when the last cursor is closed.
3554 */
sqlite3BtreeCloseCursor(BtCursor * pCur)3555 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3556 Btree *pBtree = pCur->pBtree;
3557 if( pBtree ){
3558 int i;
3559 BtShared *pBt = pCur->pBt;
3560 sqlite3BtreeEnter(pBtree);
3561 sqlite3BtreeClearCursor(pCur);
3562 if( pCur->pPrev ){
3563 pCur->pPrev->pNext = pCur->pNext;
3564 }else{
3565 pBt->pCursor = pCur->pNext;
3566 }
3567 if( pCur->pNext ){
3568 pCur->pNext->pPrev = pCur->pPrev;
3569 }
3570 for(i=0; i<=pCur->iPage; i++){
3571 releasePage(pCur->apPage[i]);
3572 }
3573 unlockBtreeIfUnused(pBt);
3574 invalidateOverflowCache(pCur);
3575 /* sqlite3_free(pCur); */
3576 sqlite3BtreeLeave(pBtree);
3577 }
3578 return SQLITE_OK;
3579 }
3580
/*
** Make sure the BtCursor* given in the argument has a valid
** BtCursor.info structure.  If it is not already valid, call
** btreeParseCell() to fill it in.
**
** BtCursor.info is a cache of the information in the current cell.
** Using this cache reduces the number of calls to btreeParseCell().
**
** 2007-06-25:  There is a bug in some versions of MSVC that causes the
** compiler to crash when getCellInfo() is implemented as a macro.
** But there is a measurable speed advantage to using the macro on gcc
** (when fewer compiler optimizations like -Os or -O0 are used and the
** compiler is not doing aggressive inlining.)  So we use a real function
** for MSVC and a macro for everything else.  Ticket #2457.
*/
3596 #ifndef NDEBUG
assertCellInfo(BtCursor * pCur)3597 static void assertCellInfo(BtCursor *pCur){
3598 CellInfo info;
3599 int iPage = pCur->iPage;
3600 memset(&info, 0, sizeof(info));
3601 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3602 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
3603 }
3604 #else
3605 #define assertCellInfo(x)
3606 #endif
#ifdef _MSC_VER
  /* Use a real function in MSVC to work around bugs in that compiler.
  **
  ** If pCur->info has not been filled in yet (nSize==0), parse the cell
  ** the cursor points at into pCur->info and mark the cached key as
  ** valid.  Otherwise, in debugging builds, verify that the cached
  ** copy is still accurate. */
  static void getCellInfo(BtCursor *pCur){
    if( pCur->info.nSize==0 ){
      int iPage = pCur->iPage;
      btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
      pCur->validNKey = 1;
    }else{
      assertCellInfo(pCur);
    }
  }
#else /* if not _MSC_VER */
  /* Use a macro in all other compilers so that the function is inlined.
  **
  ** The expansion is wrapped in do{...}while(0) so that the invocation
  ** "getCellInfo(p);" is exactly one statement.  That keeps it safe to
  ** use as the body of an unbraced if/else.  The argument is
  ** parenthesized at every use so that expressions such as "&cur"
  ** expand correctly.  The argument is evaluated more than once, so it
  ** must not have side effects. */
#define getCellInfo(pCur)                                                \
  do{                                                                    \
    if( (pCur)->info.nSize==0 ){                                         \
      int iPage = (pCur)->iPage;                                         \
      btreeParseCell((pCur)->apPage[iPage],(pCur)->aiIdx[iPage],         \
                     &(pCur)->info);                                     \
      (pCur)->validNKey = 1;                                             \
    }else{                                                               \
      assertCellInfo(pCur);                                              \
    }                                                                    \
  }while(0)
#endif /* _MSC_VER */
3629
3630 #ifndef NDEBUG /* The next routine used only within assert() statements */
3631 /*
3632 ** Return true if the given BtCursor is valid. A valid cursor is one
3633 ** that is currently pointing to a row in a (non-empty) table.
3634 ** This is a verification routine is used only within assert() statements.
3635 */
sqlite3BtreeCursorIsValid(BtCursor * pCur)3636 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3637 return pCur && pCur->eState==CURSOR_VALID;
3638 }
3639 #endif /* NDEBUG */
3640
3641 /*
3642 ** Set *pSize to the size of the buffer needed to hold the value of
3643 ** the key for the current entry. If the cursor is not pointing
3644 ** to a valid entry, *pSize is set to 0.
3645 **
3646 ** For a table with the INTKEY flag set, this routine returns the key
3647 ** itself, not the number of bytes in the key.
3648 **
3649 ** The caller must position the cursor prior to invoking this routine.
3650 **
3651 ** This routine cannot fail. It always returns SQLITE_OK.
3652 */
sqlite3BtreeKeySize(BtCursor * pCur,i64 * pSize)3653 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3654 assert( cursorHoldsMutex(pCur) );
3655 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3656 if( pCur->eState!=CURSOR_VALID ){
3657 *pSize = 0;
3658 }else{
3659 getCellInfo(pCur);
3660 *pSize = pCur->info.nKey;
3661 }
3662 return SQLITE_OK;
3663 }
3664
3665 /*
3666 ** Set *pSize to the number of bytes of data in the entry the
3667 ** cursor currently points to.
3668 **
3669 ** The caller must guarantee that the cursor is pointing to a non-NULL
3670 ** valid entry. In other words, the calling procedure must guarantee
3671 ** that the cursor has Cursor.eState==CURSOR_VALID.
3672 **
3673 ** Failure is not possible. This function always returns SQLITE_OK.
3674 ** It might just as well be a procedure (returning void) but we continue
3675 ** to return an integer result code for historical reasons.
3676 */
sqlite3BtreeDataSize(BtCursor * pCur,u32 * pSize)3677 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3678 assert( cursorHoldsMutex(pCur) );
3679 assert( pCur->eState==CURSOR_VALID );
3680 getCellInfo(pCur);
3681 *pSize = pCur->info.nData;
3682 return SQLITE_OK;
3683 }
3684
3685 /*
3686 ** Given the page number of an overflow page in the database (parameter
3687 ** ovfl), this function finds the page number of the next page in the
3688 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3689 ** pointer-map data instead of reading the content of page ovfl to do so.
3690 **
3691 ** If an error occurs an SQLite error code is returned. Otherwise:
3692 **
3693 ** The page number of the next overflow page in the linked list is
3694 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3695 ** list, *pPgnoNext is set to zero.
3696 **
3697 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
3698 ** to page number pOvfl was obtained, then *ppPage is set to point to that
3699 ** reference. It is the responsibility of the caller to call releasePage()
3700 ** on *ppPage to free the reference. In no reference was obtained (because
3701 ** the pointer-map was used to obtain the value for *pPgnoNext), then
3702 ** *ppPage is set to zero.
3703 */
getOverflowPage(BtShared * pBt,Pgno ovfl,MemPage ** ppPage,Pgno * pPgnoNext)3704 static int getOverflowPage(
3705 BtShared *pBt, /* The database file */
3706 Pgno ovfl, /* Current overflow page number */
3707 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
3708 Pgno *pPgnoNext /* OUT: Next overflow page number */
3709 ){
3710 Pgno next = 0;
3711 MemPage *pPage = 0;
3712 int rc = SQLITE_OK;
3713
3714 assert( sqlite3_mutex_held(pBt->mutex) );
3715 assert(pPgnoNext);
3716
3717 #ifndef SQLITE_OMIT_AUTOVACUUM
3718 /* Try to find the next page in the overflow list using the
3719 ** autovacuum pointer-map pages. Guess that the next page in
3720 ** the overflow list is page number (ovfl+1). If that guess turns
3721 ** out to be wrong, fall back to loading the data of page
3722 ** number ovfl to determine the next page number.
3723 */
3724 if( pBt->autoVacuum ){
3725 Pgno pgno;
3726 Pgno iGuess = ovfl+1;
3727 u8 eType;
3728
3729 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3730 iGuess++;
3731 }
3732
3733 if( iGuess<=btreePagecount(pBt) ){
3734 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3735 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3736 next = iGuess;
3737 rc = SQLITE_DONE;
3738 }
3739 }
3740 }
3741 #endif
3742
3743 assert( next==0 || rc==SQLITE_DONE );
3744 if( rc==SQLITE_OK ){
3745 rc = btreeGetPage(pBt, ovfl, &pPage, 0);
3746 assert( rc==SQLITE_OK || pPage==0 );
3747 if( rc==SQLITE_OK ){
3748 next = get4byte(pPage->aData);
3749 }
3750 }
3751
3752 *pPgnoNext = next;
3753 if( ppPage ){
3754 *ppPage = pPage;
3755 }else{
3756 releasePage(pPage);
3757 }
3758 return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3759 }
3760
3761 /*
3762 ** Copy data from a buffer to a page, or from a page to a buffer.
3763 **
3764 ** pPayload is a pointer to data stored on database page pDbPage.
3765 ** If argument eOp is false, then nByte bytes of data are copied
3766 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3767 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3768 ** of data are copied from the buffer pBuf to pPayload.
3769 **
3770 ** SQLITE_OK is returned on success, otherwise an error code.
3771 */
copyPayload(void * pPayload,void * pBuf,int nByte,int eOp,DbPage * pDbPage)3772 static int copyPayload(
3773 void *pPayload, /* Pointer to page data */
3774 void *pBuf, /* Pointer to buffer */
3775 int nByte, /* Number of bytes to copy */
3776 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3777 DbPage *pDbPage /* Page containing pPayload */
3778 ){
3779 if( eOp ){
3780 /* Copy data from buffer to page (a write operation) */
3781 int rc = sqlite3PagerWrite(pDbPage);
3782 if( rc!=SQLITE_OK ){
3783 return rc;
3784 }
3785 memcpy(pPayload, pBuf, nByte);
3786 }else{
3787 /* Copy data from page to buffer (a read operation) */
3788 memcpy(pBuf, pPayload, nByte);
3789 }
3790 return SQLITE_OK;
3791 }
3792
3793 /*
3794 ** This function is used to read or overwrite payload information
3795 ** for the entry that the pCur cursor is pointing to. If the eOp
3796 ** parameter is 0, this is a read operation (data copied into
3797 ** buffer pBuf). If it is non-zero, a write (data copied from
3798 ** buffer pBuf).
3799 **
3800 ** A total of "amt" bytes are read or written beginning at "offset".
3801 ** Data is read to or from the buffer pBuf.
3802 **
3803 ** The content being read or written might appear on the main page
3804 ** or be scattered out on multiple overflow pages.
3805 **
3806 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3807 ** cursor entry uses one or more overflow pages, this function
3808 ** allocates space for and lazily popluates the overflow page-list
3809 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3810 ** cache to make seeking to the supplied offset more efficient.
3811 **
3812 ** Once an overflow page-list cache has been allocated, it may be
3813 ** invalidated if some other cursor writes to the same table, or if
3814 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3815 ** mode, the following events may invalidate an overflow page-list cache.
3816 **
3817 ** * An incremental vacuum,
3818 ** * A commit in auto_vacuum="full" mode,
3819 ** * Creating a table (may require moving an overflow page).
3820 */
accessPayload(BtCursor * pCur,u32 offset,u32 amt,unsigned char * pBuf,int eOp)3821 static int accessPayload(
3822 BtCursor *pCur, /* Cursor pointing to entry to read from */
3823 u32 offset, /* Begin reading this far into payload */
3824 u32 amt, /* Read this many bytes */
3825 unsigned char *pBuf, /* Write the bytes into this buffer */
3826 int eOp /* zero to read. non-zero to write. */
3827 ){
3828 unsigned char *aPayload;
3829 int rc = SQLITE_OK;
3830 u32 nKey;
3831 int iIdx = 0;
3832 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3833 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
3834
3835 assert( pPage );
3836 assert( pCur->eState==CURSOR_VALID );
3837 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3838 assert( cursorHoldsMutex(pCur) );
3839
3840 getCellInfo(pCur);
3841 aPayload = pCur->info.pCell + pCur->info.nHeader;
3842 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
3843
3844 if( NEVER(offset+amt > nKey+pCur->info.nData)
3845 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3846 ){
3847 /* Trying to read or write past the end of the data is an error */
3848 return SQLITE_CORRUPT_BKPT;
3849 }
3850
3851 /* Check if data must be read/written to/from the btree page itself. */
3852 if( offset<pCur->info.nLocal ){
3853 int a = amt;
3854 if( a+offset>pCur->info.nLocal ){
3855 a = pCur->info.nLocal - offset;
3856 }
3857 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3858 offset = 0;
3859 pBuf += a;
3860 amt -= a;
3861 }else{
3862 offset -= pCur->info.nLocal;
3863 }
3864
3865 if( rc==SQLITE_OK && amt>0 ){
3866 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3867 Pgno nextPage;
3868
3869 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3870
3871 #ifndef SQLITE_OMIT_INCRBLOB
3872 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3873 ** has not been allocated, allocate it now. The array is sized at
3874 ** one entry for each overflow page in the overflow chain. The
3875 ** page number of the first overflow page is stored in aOverflow[0],
3876 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3877 ** (the cache is lazily populated).
3878 */
3879 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3880 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3881 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3882 /* nOvfl is always positive. If it were zero, fetchPayload would have
3883 ** been used instead of this routine. */
3884 if( ALWAYS(nOvfl) && !pCur->aOverflow ){
3885 rc = SQLITE_NOMEM;
3886 }
3887 }
3888
3889 /* If the overflow page-list cache has been allocated and the
3890 ** entry for the first required overflow page is valid, skip
3891 ** directly to it.
3892 */
3893 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3894 iIdx = (offset/ovflSize);
3895 nextPage = pCur->aOverflow[iIdx];
3896 offset = (offset%ovflSize);
3897 }
3898 #endif
3899
3900 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3901
3902 #ifndef SQLITE_OMIT_INCRBLOB
3903 /* If required, populate the overflow page-list cache. */
3904 if( pCur->aOverflow ){
3905 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3906 pCur->aOverflow[iIdx] = nextPage;
3907 }
3908 #endif
3909
3910 if( offset>=ovflSize ){
3911 /* The only reason to read this page is to obtain the page
3912 ** number for the next page in the overflow chain. The page
3913 ** data is not required. So first try to lookup the overflow
3914 ** page-list cache, if any, then fall back to the getOverflowPage()
3915 ** function.
3916 */
3917 #ifndef SQLITE_OMIT_INCRBLOB
3918 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3919 nextPage = pCur->aOverflow[iIdx+1];
3920 } else
3921 #endif
3922 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3923 offset -= ovflSize;
3924 }else{
3925 /* Need to read this page properly. It contains some of the
3926 ** range of data that is being read (eOp==0) or written (eOp!=0).
3927 */
3928 DbPage *pDbPage;
3929 int a = amt;
3930 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3931 if( rc==SQLITE_OK ){
3932 aPayload = sqlite3PagerGetData(pDbPage);
3933 nextPage = get4byte(aPayload);
3934 if( a + offset > ovflSize ){
3935 a = ovflSize - offset;
3936 }
3937 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3938 sqlite3PagerUnref(pDbPage);
3939 offset = 0;
3940 amt -= a;
3941 pBuf += a;
3942 }
3943 }
3944 }
3945 }
3946
3947 if( rc==SQLITE_OK && amt>0 ){
3948 return SQLITE_CORRUPT_BKPT;
3949 }
3950 return rc;
3951 }
3952
3953 /*
3954 ** Read part of the key associated with cursor pCur. Exactly
3955 ** "amt" bytes will be transfered into pBuf[]. The transfer
3956 ** begins at "offset".
3957 **
3958 ** The caller must ensure that pCur is pointing to a valid row
3959 ** in the table.
3960 **
3961 ** Return SQLITE_OK on success or an error code if anything goes
3962 ** wrong. An error is returned if "offset+amt" is larger than
3963 ** the available payload.
3964 */
sqlite3BtreeKey(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)3965 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3966 assert( cursorHoldsMutex(pCur) );
3967 assert( pCur->eState==CURSOR_VALID );
3968 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3969 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3970 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
3971 }
3972
3973 /*
3974 ** Read part of the data associated with cursor pCur. Exactly
3975 ** "amt" bytes will be transfered into pBuf[]. The transfer
3976 ** begins at "offset".
3977 **
3978 ** Return SQLITE_OK on success or an error code if anything goes
3979 ** wrong. An error is returned if "offset+amt" is larger than
3980 ** the available payload.
3981 */
sqlite3BtreeData(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)3982 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3983 int rc;
3984
3985 #ifndef SQLITE_OMIT_INCRBLOB
3986 if ( pCur->eState==CURSOR_INVALID ){
3987 return SQLITE_ABORT;
3988 }
3989 #endif
3990
3991 assert( cursorHoldsMutex(pCur) );
3992 rc = restoreCursorPosition(pCur);
3993 if( rc==SQLITE_OK ){
3994 assert( pCur->eState==CURSOR_VALID );
3995 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3996 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3997 rc = accessPayload(pCur, offset, amt, pBuf, 0);
3998 }
3999 return rc;
4000 }
4001
4002 /*
4003 ** Return a pointer to payload information from the entry that the
4004 ** pCur cursor is pointing to. The pointer is to the beginning of
4005 ** the key if skipKey==0 and it points to the beginning of data if
4006 ** skipKey==1. The number of bytes of available key/data is written
4007 ** into *pAmt. If *pAmt==0, then the value returned will not be
4008 ** a valid pointer.
4009 **
4010 ** This routine is an optimization. It is common for the entire key
4011 ** and data to fit on the local page and for there to be no overflow
4012 ** pages. When that is so, this routine can be used to access the
4013 ** key and data without making a copy. If the key and/or data spills
4014 ** onto overflow pages, then accessPayload() must be used to reassemble
4015 ** the key/data and copy it into a preallocated buffer.
4016 **
4017 ** The pointer returned by this routine looks directly into the cached
4018 ** page of the database. The data might change or move the next time
4019 ** any btree routine is called.
4020 */
fetchPayload(BtCursor * pCur,int * pAmt,int skipKey)4021 static const unsigned char *fetchPayload(
4022 BtCursor *pCur, /* Cursor pointing to entry to read from */
4023 int *pAmt, /* Write the number of available bytes here */
4024 int skipKey /* read beginning at data if this is true */
4025 ){
4026 unsigned char *aPayload;
4027 MemPage *pPage;
4028 u32 nKey;
4029 u32 nLocal;
4030
4031 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
4032 assert( pCur->eState==CURSOR_VALID );
4033 assert( cursorHoldsMutex(pCur) );
4034 pPage = pCur->apPage[pCur->iPage];
4035 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4036 if( NEVER(pCur->info.nSize==0) ){
4037 btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
4038 &pCur->info);
4039 }
4040 aPayload = pCur->info.pCell;
4041 aPayload += pCur->info.nHeader;
4042 if( pPage->intKey ){
4043 nKey = 0;
4044 }else{
4045 nKey = (int)pCur->info.nKey;
4046 }
4047 if( skipKey ){
4048 aPayload += nKey;
4049 nLocal = pCur->info.nLocal - nKey;
4050 }else{
4051 nLocal = pCur->info.nLocal;
4052 assert( nLocal<=nKey );
4053 }
4054 *pAmt = nLocal;
4055 return aPayload;
4056 }
4057
4058
4059 /*
4060 ** For the entry that cursor pCur is point to, return as
4061 ** many bytes of the key or data as are available on the local
4062 ** b-tree page. Write the number of available bytes into *pAmt.
4063 **
4064 ** The pointer returned is ephemeral. The key/data may move
4065 ** or be destroyed on the next call to any Btree routine,
4066 ** including calls from other threads against the same cache.
4067 ** Hence, a mutex on the BtShared should be held prior to calling
4068 ** this routine.
4069 **
4070 ** These routines is used to get quick access to key and data
4071 ** in the common case where no overflow pages are used.
4072 */
sqlite3BtreeKeyFetch(BtCursor * pCur,int * pAmt)4073 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
4074 const void *p = 0;
4075 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4076 assert( cursorHoldsMutex(pCur) );
4077 if( ALWAYS(pCur->eState==CURSOR_VALID) ){
4078 p = (const void*)fetchPayload(pCur, pAmt, 0);
4079 }
4080 return p;
4081 }
/* Return a pointer to the start of the data of the entry that pCur
** points to, and write into *pAmt the number of bytes obtained from
** fetchPayload().  A NULL pointer is returned, and *pAmt is left
** unchanged, if the cursor is not in the CURSOR_VALID state. */
const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
  assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  assert( cursorHoldsMutex(pCur) );
  if( !ALWAYS(pCur->eState==CURSOR_VALID) ){
    return 0;
  }
  return (const void*)fetchPayload(pCur, pAmt, 1);
}
4091
4092
4093 /*
4094 ** Move the cursor down to a new child page. The newPgno argument is the
4095 ** page number of the child page to move to.
4096 **
4097 ** This function returns SQLITE_CORRUPT if the page-header flags field of
4098 ** the new child page does not match the flags field of the parent (i.e.
4099 ** if an intkey page appears to be the parent of a non-intkey page, or
4100 ** vice-versa).
4101 */
moveToChild(BtCursor * pCur,u32 newPgno)4102 static int moveToChild(BtCursor *pCur, u32 newPgno){
4103 int rc;
4104 int i = pCur->iPage;
4105 MemPage *pNewPage;
4106 BtShared *pBt = pCur->pBt;
4107
4108 assert( cursorHoldsMutex(pCur) );
4109 assert( pCur->eState==CURSOR_VALID );
4110 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
4111 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
4112 return SQLITE_CORRUPT_BKPT;
4113 }
4114 rc = getAndInitPage(pBt, newPgno, &pNewPage);
4115 if( rc ) return rc;
4116 pCur->apPage[i+1] = pNewPage;
4117 pCur->aiIdx[i+1] = 0;
4118 pCur->iPage++;
4119
4120 pCur->info.nSize = 0;
4121 pCur->validNKey = 0;
4122 if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
4123 return SQLITE_CORRUPT_BKPT;
4124 }
4125 return SQLITE_OK;
4126 }
4127
4128 #ifndef NDEBUG
4129 /*
4130 ** Page pParent is an internal (non-leaf) tree page. This function
4131 ** asserts that page number iChild is the left-child if the iIdx'th
4132 ** cell in page pParent. Or, if iIdx is equal to the total number of
4133 ** cells in pParent, that page number iChild is the right-child of
4134 ** the page.
4135 */
assertParentIndex(MemPage * pParent,int iIdx,Pgno iChild)4136 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
4137 assert( iIdx<=pParent->nCell );
4138 if( iIdx==pParent->nCell ){
4139 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
4140 }else{
4141 assert( get4byte(findCell(pParent, iIdx))==iChild );
4142 }
4143 }
4144 #else
4145 # define assertParentIndex(x,y,z)
4146 #endif
4147
4148 /*
4149 ** Move the cursor up to the parent page.
4150 **
4151 ** pCur->idx is set to the cell index that contains the pointer
4152 ** to the page we are coming from. If we are coming from the
4153 ** right-most child page then pCur->idx is set to one more than
4154 ** the largest cell index.
4155 */
moveToParent(BtCursor * pCur)4156 static void moveToParent(BtCursor *pCur){
4157 assert( cursorHoldsMutex(pCur) );
4158 assert( pCur->eState==CURSOR_VALID );
4159 assert( pCur->iPage>0 );
4160 assert( pCur->apPage[pCur->iPage] );
4161 assertParentIndex(
4162 pCur->apPage[pCur->iPage-1],
4163 pCur->aiIdx[pCur->iPage-1],
4164 pCur->apPage[pCur->iPage]->pgno
4165 );
4166 releasePage(pCur->apPage[pCur->iPage]);
4167 pCur->iPage--;
4168 pCur->info.nSize = 0;
4169 pCur->validNKey = 0;
4170 }
4171
4172 /*
4173 ** Move the cursor to point to the root page of its b-tree structure.
4174 **
4175 ** If the table has a virtual root page, then the cursor is moved to point
4176 ** to the virtual root page instead of the actual root page. A table has a
4177 ** virtual root page when the actual root page contains no cells and a
4178 ** single child page. This can only happen with the table rooted at page 1.
4179 **
4180 ** If the b-tree structure is empty, the cursor state is set to
4181 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4182 ** cell located on the root (or virtual root) page and the cursor state
4183 ** is set to CURSOR_VALID.
4184 **
4185 ** If this function returns successfully, it may be assumed that the
4186 ** page-header flags indicate that the [virtual] root-page is the expected
4187 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4188 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4189 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4190 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4191 ** b-tree).
4192 */
moveToRoot(BtCursor * pCur)4193 static int moveToRoot(BtCursor *pCur){
4194 MemPage *pRoot;
4195 int rc = SQLITE_OK;
4196 Btree *p = pCur->pBtree;
4197 BtShared *pBt = p->pBt;
4198
4199 assert( cursorHoldsMutex(pCur) );
4200 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4201 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
4202 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
4203 if( pCur->eState>=CURSOR_REQUIRESEEK ){
4204 if( pCur->eState==CURSOR_FAULT ){
4205 assert( pCur->skipNext!=SQLITE_OK );
4206 return pCur->skipNext;
4207 }
4208 sqlite3BtreeClearCursor(pCur);
4209 }
4210
4211 if( pCur->iPage>=0 ){
4212 int i;
4213 for(i=1; i<=pCur->iPage; i++){
4214 releasePage(pCur->apPage[i]);
4215 }
4216 pCur->iPage = 0;
4217 }else{
4218 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
4219 if( rc!=SQLITE_OK ){
4220 pCur->eState = CURSOR_INVALID;
4221 return rc;
4222 }
4223 pCur->iPage = 0;
4224
4225 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4226 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4227 ** NULL, the caller expects a table b-tree. If this is not the case,
4228 ** return an SQLITE_CORRUPT error. */
4229 assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
4230 if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
4231 return SQLITE_CORRUPT_BKPT;
4232 }
4233 }
4234
4235 /* Assert that the root page is of the correct type. This must be the
4236 ** case as the call to this function that loaded the root-page (either
4237 ** this call or a previous invocation) would have detected corruption
4238 ** if the assumption were not true, and it is not possible for the flags
4239 ** byte to have been modified while this cursor is holding a reference
4240 ** to the page. */
4241 pRoot = pCur->apPage[0];
4242 assert( pRoot->pgno==pCur->pgnoRoot );
4243 assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
4244
4245 pCur->aiIdx[0] = 0;
4246 pCur->info.nSize = 0;
4247 pCur->atLast = 0;
4248 pCur->validNKey = 0;
4249
4250 if( pRoot->nCell==0 && !pRoot->leaf ){
4251 Pgno subpage;
4252 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4253 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4254 pCur->eState = CURSOR_VALID;
4255 rc = moveToChild(pCur, subpage);
4256 }else{
4257 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
4258 }
4259 return rc;
4260 }
4261
4262 /*
4263 ** Move the cursor down to the left-most leaf entry beneath the
4264 ** entry to which it is currently pointing.
4265 **
4266 ** The left-most leaf is the one with the smallest key - the first
4267 ** in ascending order.
4268 */
moveToLeftmost(BtCursor * pCur)4269 static int moveToLeftmost(BtCursor *pCur){
4270 Pgno pgno;
4271 int rc = SQLITE_OK;
4272 MemPage *pPage;
4273
4274 assert( cursorHoldsMutex(pCur) );
4275 assert( pCur->eState==CURSOR_VALID );
4276 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4277 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4278 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4279 rc = moveToChild(pCur, pgno);
4280 }
4281 return rc;
4282 }
4283
4284 /*
4285 ** Move the cursor down to the right-most leaf entry beneath the
4286 ** page to which it is currently pointing. Notice the difference
4287 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
4288 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4289 ** finds the right-most entry beneath the *page*.
4290 **
4291 ** The right-most entry is the one with the largest key - the last
4292 ** key in ascending order.
4293 */
moveToRightmost(BtCursor * pCur)4294 static int moveToRightmost(BtCursor *pCur){
4295 Pgno pgno;
4296 int rc = SQLITE_OK;
4297 MemPage *pPage = 0;
4298
4299 assert( cursorHoldsMutex(pCur) );
4300 assert( pCur->eState==CURSOR_VALID );
4301 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4302 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4303 pCur->aiIdx[pCur->iPage] = pPage->nCell;
4304 rc = moveToChild(pCur, pgno);
4305 }
4306 if( rc==SQLITE_OK ){
4307 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4308 pCur->info.nSize = 0;
4309 pCur->validNKey = 0;
4310 }
4311 return rc;
4312 }
4313
4314 /* Move the cursor to the first entry in the table. Return SQLITE_OK
4315 ** on success. Set *pRes to 0 if the cursor actually points to something
4316 ** or set *pRes to 1 if the table is empty.
4317 */
sqlite3BtreeFirst(BtCursor * pCur,int * pRes)4318 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4319 int rc;
4320
4321 assert( cursorHoldsMutex(pCur) );
4322 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4323 rc = moveToRoot(pCur);
4324 if( rc==SQLITE_OK ){
4325 if( pCur->eState==CURSOR_INVALID ){
4326 assert( pCur->apPage[pCur->iPage]->nCell==0 );
4327 *pRes = 1;
4328 }else{
4329 assert( pCur->apPage[pCur->iPage]->nCell>0 );
4330 *pRes = 0;
4331 rc = moveToLeftmost(pCur);
4332 }
4333 }
4334 return rc;
4335 }
4336
4337 /* Move the cursor to the last entry in the table. Return SQLITE_OK
4338 ** on success. Set *pRes to 0 if the cursor actually points to something
4339 ** or set *pRes to 1 if the table is empty.
4340 */
sqlite3BtreeLast(BtCursor * pCur,int * pRes)4341 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4342 int rc;
4343
4344 assert( cursorHoldsMutex(pCur) );
4345 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4346
4347 /* If the cursor already points to the last entry, this is a no-op. */
4348 if( CURSOR_VALID==pCur->eState && pCur->atLast ){
4349 #ifdef SQLITE_DEBUG
4350 /* This block serves to assert() that the cursor really does point
4351 ** to the last entry in the b-tree. */
4352 int ii;
4353 for(ii=0; ii<pCur->iPage; ii++){
4354 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4355 }
4356 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4357 assert( pCur->apPage[pCur->iPage]->leaf );
4358 #endif
4359 return SQLITE_OK;
4360 }
4361
4362 rc = moveToRoot(pCur);
4363 if( rc==SQLITE_OK ){
4364 if( CURSOR_INVALID==pCur->eState ){
4365 assert( pCur->apPage[pCur->iPage]->nCell==0 );
4366 *pRes = 1;
4367 }else{
4368 assert( pCur->eState==CURSOR_VALID );
4369 *pRes = 0;
4370 rc = moveToRightmost(pCur);
4371 pCur->atLast = rc==SQLITE_OK ?1:0;
4372 }
4373 }
4374 return rc;
4375 }
4376
4377 /* Move the cursor so that it points to an entry near the key
4378 ** specified by pIdxKey or intKey. Return a success code.
4379 **
4380 ** For INTKEY tables, the intKey parameter is used. pIdxKey
4381 ** must be NULL. For index tables, pIdxKey is used and intKey
4382 ** is ignored.
4383 **
4384 ** If an exact match is not found, then the cursor is always
4385 ** left pointing at a leaf page which would hold the entry if it
4386 ** were present. The cursor might point to an entry that comes
4387 ** before or after the key.
4388 **
4389 ** An integer is written into *pRes which is the result of
4390 ** comparing the key with the entry to which the cursor is
4391 ** pointing. The meaning of the integer written into
4392 ** *pRes is as follows:
4393 **
4394 ** *pRes<0 The cursor is left pointing at an entry that
4395 ** is smaller than intKey/pIdxKey or if the table is empty
4396 ** and the cursor is therefore left point to nothing.
4397 **
4398 ** *pRes==0 The cursor is left pointing at an entry that
4399 ** exactly matches intKey/pIdxKey.
4400 **
4401 ** *pRes>0 The cursor is left pointing at an entry that
4402 ** is larger than intKey/pIdxKey.
4403 **
4404 */
sqlite3BtreeMovetoUnpacked(BtCursor * pCur,UnpackedRecord * pIdxKey,i64 intKey,int biasRight,int * pRes)4405 int sqlite3BtreeMovetoUnpacked(
4406 BtCursor *pCur, /* The cursor to be moved */
4407 UnpackedRecord *pIdxKey, /* Unpacked index key */
4408 i64 intKey, /* The table key */
4409 int biasRight, /* If true, bias the search to the high end */
4410 int *pRes /* Write search results here */
4411 ){
4412 int rc;
4413
4414 assert( cursorHoldsMutex(pCur) );
4415 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4416 assert( pRes );
4417 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
4418
4419 /* If the cursor is already positioned at the point we are trying
4420 ** to move to, then just return without doing any work */
4421 if( pCur->eState==CURSOR_VALID && pCur->validNKey
4422 && pCur->apPage[0]->intKey
4423 ){
4424 if( pCur->info.nKey==intKey ){
4425 *pRes = 0;
4426 return SQLITE_OK;
4427 }
4428 if( pCur->atLast && pCur->info.nKey<intKey ){
4429 *pRes = -1;
4430 return SQLITE_OK;
4431 }
4432 }
4433
4434 rc = moveToRoot(pCur);
4435 if( rc ){
4436 return rc;
4437 }
4438 assert( pCur->apPage[pCur->iPage] );
4439 assert( pCur->apPage[pCur->iPage]->isInit );
4440 assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID );
4441 if( pCur->eState==CURSOR_INVALID ){
4442 *pRes = -1;
4443 assert( pCur->apPage[pCur->iPage]->nCell==0 );
4444 return SQLITE_OK;
4445 }
4446 assert( pCur->apPage[0]->intKey || pIdxKey );
4447 for(;;){
4448 int lwr, upr;
4449 Pgno chldPg;
4450 MemPage *pPage = pCur->apPage[pCur->iPage];
4451 int c;
4452
4453 /* pPage->nCell must be greater than zero. If this is the root-page
4454 ** the cursor would have been INVALID above and this for(;;) loop
4455 ** not run. If this is not the root-page, then the moveToChild() routine
4456 ** would have already detected db corruption. Similarly, pPage must
4457 ** be the right kind (index or table) of b-tree page. Otherwise
4458 ** a moveToChild() or moveToRoot() call would have detected corruption. */
4459 assert( pPage->nCell>0 );
4460 assert( pPage->intKey==(pIdxKey==0) );
4461 lwr = 0;
4462 upr = pPage->nCell-1;
4463 if( biasRight ){
4464 pCur->aiIdx[pCur->iPage] = (u16)upr;
4465 }else{
4466 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2);
4467 }
4468 for(;;){
4469 int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */
4470 u8 *pCell; /* Pointer to current cell in pPage */
4471
4472 pCur->info.nSize = 0;
4473 pCell = findCell(pPage, idx) + pPage->childPtrSize;
4474 if( pPage->intKey ){
4475 i64 nCellKey;
4476 if( pPage->hasData ){
4477 u32 dummy;
4478 pCell += getVarint32(pCell, dummy);
4479 }
4480 getVarint(pCell, (u64*)&nCellKey);
4481 if( nCellKey==intKey ){
4482 c = 0;
4483 }else if( nCellKey<intKey ){
4484 c = -1;
4485 }else{
4486 assert( nCellKey>intKey );
4487 c = +1;
4488 }
4489 pCur->validNKey = 1;
4490 pCur->info.nKey = nCellKey;
4491 }else{
4492 /* The maximum supported page-size is 65536 bytes. This means that
4493 ** the maximum number of record bytes stored on an index B-Tree
4494 ** page is less than 16384 bytes and may be stored as a 2-byte
4495 ** varint. This information is used to attempt to avoid parsing
4496 ** the entire cell by checking for the cases where the record is
4497 ** stored entirely within the b-tree page by inspecting the first
4498 ** 2 bytes of the cell.
4499 */
4500 int nCell = pCell[0];
4501 if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){
4502 /* This branch runs if the record-size field of the cell is a
4503 ** single byte varint and the record fits entirely on the main
4504 ** b-tree page. */
4505 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
4506 }else if( !(pCell[1] & 0x80)
4507 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4508 ){
4509 /* The record-size field is a 2 byte varint and the record
4510 ** fits entirely on the main b-tree page. */
4511 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
4512 }else{
4513 /* The record flows over onto one or more overflow pages. In
4514 ** this case the whole cell needs to be parsed, a buffer allocated
4515 ** and accessPayload() used to retrieve the record into the
4516 ** buffer before VdbeRecordCompare() can be called. */
4517 void *pCellKey;
4518 u8 * const pCellBody = pCell - pPage->childPtrSize;
4519 btreeParseCellPtr(pPage, pCellBody, &pCur->info);
4520 nCell = (int)pCur->info.nKey;
4521 pCellKey = sqlite3Malloc( nCell );
4522 if( pCellKey==0 ){
4523 rc = SQLITE_NOMEM;
4524 goto moveto_finish;
4525 }
4526 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
4527 if( rc ){
4528 sqlite3_free(pCellKey);
4529 goto moveto_finish;
4530 }
4531 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
4532 sqlite3_free(pCellKey);
4533 }
4534 }
4535 if( c==0 ){
4536 if( pPage->intKey && !pPage->leaf ){
4537 lwr = idx;
4538 upr = lwr - 1;
4539 break;
4540 }else{
4541 *pRes = 0;
4542 rc = SQLITE_OK;
4543 goto moveto_finish;
4544 }
4545 }
4546 if( c<0 ){
4547 lwr = idx+1;
4548 }else{
4549 upr = idx-1;
4550 }
4551 if( lwr>upr ){
4552 break;
4553 }
4554 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2);
4555 }
4556 assert( lwr==upr+1 );
4557 assert( pPage->isInit );
4558 if( pPage->leaf ){
4559 chldPg = 0;
4560 }else if( lwr>=pPage->nCell ){
4561 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4562 }else{
4563 chldPg = get4byte(findCell(pPage, lwr));
4564 }
4565 if( chldPg==0 ){
4566 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4567 *pRes = c;
4568 rc = SQLITE_OK;
4569 goto moveto_finish;
4570 }
4571 pCur->aiIdx[pCur->iPage] = (u16)lwr;
4572 pCur->info.nSize = 0;
4573 pCur->validNKey = 0;
4574 rc = moveToChild(pCur, chldPg);
4575 if( rc ) goto moveto_finish;
4576 }
4577 moveto_finish:
4578 return rc;
4579 }
4580
4581
4582 /*
4583 ** Return TRUE if the cursor is not pointing at an entry of the table.
4584 **
4585 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4586 ** past the last entry in the table or sqlite3BtreePrev() moves past
4587 ** the first entry. TRUE is also returned if the table is empty.
4588 */
sqlite3BtreeEof(BtCursor * pCur)4589 int sqlite3BtreeEof(BtCursor *pCur){
4590 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4591 ** have been deleted? This API will need to change to return an error code
4592 ** as well as the boolean result value.
4593 */
4594 return (CURSOR_VALID!=pCur->eState);
4595 }
4596
4597 /*
4598 ** Advance the cursor to the next entry in the database. If
4599 ** successful then set *pRes=0. If the cursor
4600 ** was already pointing to the last entry in the database before
4601 ** this routine was called, then set *pRes=1.
4602 */
sqlite3BtreeNext(BtCursor * pCur,int * pRes)4603 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4604 int rc;
4605 int idx;
4606 MemPage *pPage;
4607
4608 assert( cursorHoldsMutex(pCur) );
4609 rc = restoreCursorPosition(pCur);
4610 if( rc!=SQLITE_OK ){
4611 return rc;
4612 }
4613 assert( pRes!=0 );
4614 if( CURSOR_INVALID==pCur->eState ){
4615 *pRes = 1;
4616 return SQLITE_OK;
4617 }
4618 if( pCur->skipNext>0 ){
4619 pCur->skipNext = 0;
4620 *pRes = 0;
4621 return SQLITE_OK;
4622 }
4623 pCur->skipNext = 0;
4624
4625 pPage = pCur->apPage[pCur->iPage];
4626 idx = ++pCur->aiIdx[pCur->iPage];
4627 assert( pPage->isInit );
4628 assert( idx<=pPage->nCell );
4629
4630 pCur->info.nSize = 0;
4631 pCur->validNKey = 0;
4632 if( idx>=pPage->nCell ){
4633 if( !pPage->leaf ){
4634 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4635 if( rc ) return rc;
4636 rc = moveToLeftmost(pCur);
4637 *pRes = 0;
4638 return rc;
4639 }
4640 do{
4641 if( pCur->iPage==0 ){
4642 *pRes = 1;
4643 pCur->eState = CURSOR_INVALID;
4644 return SQLITE_OK;
4645 }
4646 moveToParent(pCur);
4647 pPage = pCur->apPage[pCur->iPage];
4648 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4649 *pRes = 0;
4650 if( pPage->intKey ){
4651 rc = sqlite3BtreeNext(pCur, pRes);
4652 }else{
4653 rc = SQLITE_OK;
4654 }
4655 return rc;
4656 }
4657 *pRes = 0;
4658 if( pPage->leaf ){
4659 return SQLITE_OK;
4660 }
4661 rc = moveToLeftmost(pCur);
4662 return rc;
4663 }
4664
4665
4666 /*
4667 ** Step the cursor to the back to the previous entry in the database. If
4668 ** successful then set *pRes=0. If the cursor
4669 ** was already pointing to the first entry in the database before
4670 ** this routine was called, then set *pRes=1.
4671 */
sqlite3BtreePrevious(BtCursor * pCur,int * pRes)4672 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4673 int rc;
4674 MemPage *pPage;
4675
4676 assert( cursorHoldsMutex(pCur) );
4677 rc = restoreCursorPosition(pCur);
4678 if( rc!=SQLITE_OK ){
4679 return rc;
4680 }
4681 pCur->atLast = 0;
4682 if( CURSOR_INVALID==pCur->eState ){
4683 *pRes = 1;
4684 return SQLITE_OK;
4685 }
4686 if( pCur->skipNext<0 ){
4687 pCur->skipNext = 0;
4688 *pRes = 0;
4689 return SQLITE_OK;
4690 }
4691 pCur->skipNext = 0;
4692
4693 pPage = pCur->apPage[pCur->iPage];
4694 assert( pPage->isInit );
4695 if( !pPage->leaf ){
4696 int idx = pCur->aiIdx[pCur->iPage];
4697 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4698 if( rc ){
4699 return rc;
4700 }
4701 rc = moveToRightmost(pCur);
4702 }else{
4703 while( pCur->aiIdx[pCur->iPage]==0 ){
4704 if( pCur->iPage==0 ){
4705 pCur->eState = CURSOR_INVALID;
4706 *pRes = 1;
4707 return SQLITE_OK;
4708 }
4709 moveToParent(pCur);
4710 }
4711 pCur->info.nSize = 0;
4712 pCur->validNKey = 0;
4713
4714 pCur->aiIdx[pCur->iPage]--;
4715 pPage = pCur->apPage[pCur->iPage];
4716 if( pPage->intKey && !pPage->leaf ){
4717 rc = sqlite3BtreePrevious(pCur, pRes);
4718 }else{
4719 rc = SQLITE_OK;
4720 }
4721 }
4722 *pRes = 0;
4723 return rc;
4724 }
4725
4726 /*
4727 ** Allocate a new page from the database file.
4728 **
4729 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
4730 ** has already been called on the new page.) The new page has also
4731 ** been referenced and the calling routine is responsible for calling
4732 ** sqlite3PagerUnref() on the new page when it is done.
4733 **
4734 ** SQLITE_OK is returned on success. Any other return value indicates
4735 ** an error. *ppPage and *pPgno are undefined in the event of an error.
4736 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4737 **
4738 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4739 ** locate a page close to the page number "nearby". This can be used in an
4740 ** attempt to keep related pages close to each other in the database file,
4741 ** which in turn can make database access faster.
4742 **
4743 ** If the "exact" parameter is not 0, and the page-number nearby exists
4744 ** anywhere on the free-list, then it is guarenteed to be returned. This
4745 ** is only used by auto-vacuum databases when allocating a new table.
4746 */
allocateBtreePage(BtShared * pBt,MemPage ** ppPage,Pgno * pPgno,Pgno nearby,u8 exact)4747 static int allocateBtreePage(
4748 BtShared *pBt,
4749 MemPage **ppPage,
4750 Pgno *pPgno,
4751 Pgno nearby,
4752 u8 exact
4753 ){
4754 MemPage *pPage1;
4755 int rc;
4756 u32 n; /* Number of pages on the freelist */
4757 u32 k; /* Number of leaves on the trunk of the freelist */
4758 MemPage *pTrunk = 0;
4759 MemPage *pPrevTrunk = 0;
4760 Pgno mxPage; /* Total size of the database file */
4761
4762 assert( sqlite3_mutex_held(pBt->mutex) );
4763 pPage1 = pBt->pPage1;
4764 mxPage = btreePagecount(pBt);
4765 n = get4byte(&pPage1->aData[36]);
4766 testcase( n==mxPage-1 );
4767 if( n>=mxPage ){
4768 return SQLITE_CORRUPT_BKPT;
4769 }
4770 if( n>0 ){
4771 /* There are pages on the freelist. Reuse one of those pages. */
4772 Pgno iTrunk;
4773 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4774
4775 /* If the 'exact' parameter was true and a query of the pointer-map
4776 ** shows that the page 'nearby' is somewhere on the free-list, then
4777 ** the entire-list will be searched for that page.
4778 */
4779 #ifndef SQLITE_OMIT_AUTOVACUUM
4780 if( exact && nearby<=mxPage ){
4781 u8 eType;
4782 assert( nearby>0 );
4783 assert( pBt->autoVacuum );
4784 rc = ptrmapGet(pBt, nearby, &eType, 0);
4785 if( rc ) return rc;
4786 if( eType==PTRMAP_FREEPAGE ){
4787 searchList = 1;
4788 }
4789 *pPgno = nearby;
4790 }
4791 #endif
4792
4793 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4794 ** first free-list trunk page. iPrevTrunk is initially 1.
4795 */
4796 rc = sqlite3PagerWrite(pPage1->pDbPage);
4797 if( rc ) return rc;
4798 put4byte(&pPage1->aData[36], n-1);
4799
4800 /* The code within this loop is run only once if the 'searchList' variable
4801 ** is not true. Otherwise, it runs once for each trunk-page on the
4802 ** free-list until the page 'nearby' is located.
4803 */
4804 do {
4805 pPrevTrunk = pTrunk;
4806 if( pPrevTrunk ){
4807 iTrunk = get4byte(&pPrevTrunk->aData[0]);
4808 }else{
4809 iTrunk = get4byte(&pPage1->aData[32]);
4810 }
4811 testcase( iTrunk==mxPage );
4812 if( iTrunk>mxPage ){
4813 rc = SQLITE_CORRUPT_BKPT;
4814 }else{
4815 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
4816 }
4817 if( rc ){
4818 pTrunk = 0;
4819 goto end_allocate_page;
4820 }
4821
4822 k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
4823 if( k==0 && !searchList ){
4824 /* The trunk has no leaves and the list is not being searched.
4825 ** So extract the trunk page itself and use it as the newly
4826 ** allocated page */
4827 assert( pPrevTrunk==0 );
4828 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4829 if( rc ){
4830 goto end_allocate_page;
4831 }
4832 *pPgno = iTrunk;
4833 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4834 *ppPage = pTrunk;
4835 pTrunk = 0;
4836 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4837 }else if( k>(u32)(pBt->usableSize/4 - 2) ){
4838 /* Value of k is out of range. Database corruption */
4839 rc = SQLITE_CORRUPT_BKPT;
4840 goto end_allocate_page;
4841 #ifndef SQLITE_OMIT_AUTOVACUUM
4842 }else if( searchList && nearby==iTrunk ){
4843 /* The list is being searched and this trunk page is the page
4844 ** to allocate, regardless of whether it has leaves.
4845 */
4846 assert( *pPgno==iTrunk );
4847 *ppPage = pTrunk;
4848 searchList = 0;
4849 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4850 if( rc ){
4851 goto end_allocate_page;
4852 }
4853 if( k==0 ){
4854 if( !pPrevTrunk ){
4855 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4856 }else{
4857 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4858 if( rc!=SQLITE_OK ){
4859 goto end_allocate_page;
4860 }
4861 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4862 }
4863 }else{
4864 /* The trunk page is required by the caller but it contains
4865 ** pointers to free-list leaves. The first leaf becomes a trunk
4866 ** page in this case.
4867 */
4868 MemPage *pNewTrunk;
4869 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4870 if( iNewTrunk>mxPage ){
4871 rc = SQLITE_CORRUPT_BKPT;
4872 goto end_allocate_page;
4873 }
4874 testcase( iNewTrunk==mxPage );
4875 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4876 if( rc!=SQLITE_OK ){
4877 goto end_allocate_page;
4878 }
4879 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4880 if( rc!=SQLITE_OK ){
4881 releasePage(pNewTrunk);
4882 goto end_allocate_page;
4883 }
4884 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4885 put4byte(&pNewTrunk->aData[4], k-1);
4886 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4887 releasePage(pNewTrunk);
4888 if( !pPrevTrunk ){
4889 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
4890 put4byte(&pPage1->aData[32], iNewTrunk);
4891 }else{
4892 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4893 if( rc ){
4894 goto end_allocate_page;
4895 }
4896 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4897 }
4898 }
4899 pTrunk = 0;
4900 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4901 #endif
4902 }else if( k>0 ){
4903 /* Extract a leaf from the trunk */
4904 u32 closest;
4905 Pgno iPage;
4906 unsigned char *aData = pTrunk->aData;
4907 if( nearby>0 ){
4908 u32 i;
4909 int dist;
4910 closest = 0;
4911 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
4912 for(i=1; i<k; i++){
4913 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
4914 if( d2<dist ){
4915 closest = i;
4916 dist = d2;
4917 }
4918 }
4919 }else{
4920 closest = 0;
4921 }
4922
4923 iPage = get4byte(&aData[8+closest*4]);
4924 testcase( iPage==mxPage );
4925 if( iPage>mxPage ){
4926 rc = SQLITE_CORRUPT_BKPT;
4927 goto end_allocate_page;
4928 }
4929 testcase( iPage==mxPage );
4930 if( !searchList || iPage==nearby ){
4931 int noContent;
4932 *pPgno = iPage;
4933 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4934 ": %d more free pages\n",
4935 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4936 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4937 if( rc ) goto end_allocate_page;
4938 if( closest<k-1 ){
4939 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4940 }
4941 put4byte(&aData[4], k-1);
4942 noContent = !btreeGetHasContent(pBt, *pPgno);
4943 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
4944 if( rc==SQLITE_OK ){
4945 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4946 if( rc!=SQLITE_OK ){
4947 releasePage(*ppPage);
4948 }
4949 }
4950 searchList = 0;
4951 }
4952 }
4953 releasePage(pPrevTrunk);
4954 pPrevTrunk = 0;
4955 }while( searchList );
4956 }else{
4957 /* There are no pages on the freelist, so create a new page at the
4958 ** end of the file */
4959 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
4960 if( rc ) return rc;
4961 pBt->nPage++;
4962 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
4963
4964 #ifndef SQLITE_OMIT_AUTOVACUUM
4965 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
4966 /* If *pPgno refers to a pointer-map page, allocate two new pages
4967 ** at the end of the file instead of one. The first allocated page
4968 ** becomes a new pointer-map page, the second is used by the caller.
4969 */
4970 MemPage *pPg = 0;
4971 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
4972 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
4973 rc = btreeGetPage(pBt, pBt->nPage, &pPg, 1);
4974 if( rc==SQLITE_OK ){
4975 rc = sqlite3PagerWrite(pPg->pDbPage);
4976 releasePage(pPg);
4977 }
4978 if( rc ) return rc;
4979 pBt->nPage++;
4980 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
4981 }
4982 #endif
4983 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
4984 *pPgno = pBt->nPage;
4985
4986 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4987 rc = btreeGetPage(pBt, *pPgno, ppPage, 1);
4988 if( rc ) return rc;
4989 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4990 if( rc!=SQLITE_OK ){
4991 releasePage(*ppPage);
4992 }
4993 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4994 }
4995
4996 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4997
4998 end_allocate_page:
4999 releasePage(pTrunk);
5000 releasePage(pPrevTrunk);
5001 if( rc==SQLITE_OK ){
5002 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
5003 releasePage(*ppPage);
5004 return SQLITE_CORRUPT_BKPT;
5005 }
5006 (*ppPage)->isInit = 0;
5007 }else{
5008 *ppPage = 0;
5009 }
5010 assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) );
5011 return rc;
5012 }
5013
5014 /*
5015 ** This function is used to add page iPage to the database file free-list.
5016 ** It is assumed that the page is not already a part of the free-list.
5017 **
5018 ** The value passed as the second argument to this function is optional.
5019 ** If the caller happens to have a pointer to the MemPage object
5020 ** corresponding to page iPage handy, it may pass it as the second value.
5021 ** Otherwise, it may pass NULL.
5022 **
5023 ** If a pointer to a MemPage object is passed as the second argument,
5024 ** its reference count is not altered by this function.
5025 */
freePage2(BtShared * pBt,MemPage * pMemPage,Pgno iPage)5026 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
5027 MemPage *pTrunk = 0; /* Free-list trunk page */
5028 Pgno iTrunk = 0; /* Page number of free-list trunk page */
5029 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
5030 MemPage *pPage; /* Page being freed. May be NULL. */
5031 int rc; /* Return Code */
5032 int nFree; /* Initial number of pages on free-list */
5033
5034 assert( sqlite3_mutex_held(pBt->mutex) );
5035 assert( iPage>1 );
5036 assert( !pMemPage || pMemPage->pgno==iPage );
5037
5038 if( pMemPage ){
5039 pPage = pMemPage;
5040 sqlite3PagerRef(pPage->pDbPage);
5041 }else{
5042 pPage = btreePageLookup(pBt, iPage);
5043 }
5044
5045 /* Increment the free page count on pPage1 */
5046 rc = sqlite3PagerWrite(pPage1->pDbPage);
5047 if( rc ) goto freepage_out;
5048 nFree = get4byte(&pPage1->aData[36]);
5049 put4byte(&pPage1->aData[36], nFree+1);
5050
5051 if( pBt->secureDelete ){
5052 /* If the secure_delete option is enabled, then
5053 ** always fully overwrite deleted information with zeros.
5054 */
5055 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
5056 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
5057 ){
5058 goto freepage_out;
5059 }
5060 memset(pPage->aData, 0, pPage->pBt->pageSize);
5061 }
5062
5063 /* If the database supports auto-vacuum, write an entry in the pointer-map
5064 ** to indicate that the page is free.
5065 */
5066 if( ISAUTOVACUUM ){
5067 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
5068 if( rc ) goto freepage_out;
5069 }
5070
5071 /* Now manipulate the actual database free-list structure. There are two
5072 ** possibilities. If the free-list is currently empty, or if the first
5073 ** trunk page in the free-list is full, then this page will become a
5074 ** new free-list trunk page. Otherwise, it will become a leaf of the
5075 ** first trunk page in the current free-list. This block tests if it
5076 ** is possible to add the page as a new free-list leaf.
5077 */
5078 if( nFree!=0 ){
5079 u32 nLeaf; /* Initial number of leaf cells on trunk page */
5080
5081 iTrunk = get4byte(&pPage1->aData[32]);
5082 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
5083 if( rc!=SQLITE_OK ){
5084 goto freepage_out;
5085 }
5086
5087 nLeaf = get4byte(&pTrunk->aData[4]);
5088 assert( pBt->usableSize>32 );
5089 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
5090 rc = SQLITE_CORRUPT_BKPT;
5091 goto freepage_out;
5092 }
5093 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
5094 /* In this case there is room on the trunk page to insert the page
5095 ** being freed as a new leaf.
5096 **
5097 ** Note that the trunk page is not really full until it contains
5098 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
5099 ** coded. But due to a coding error in versions of SQLite prior to
5100 ** 3.6.0, databases with freelist trunk pages holding more than
5101 ** usableSize/4 - 8 entries will be reported as corrupt. In order
5102 ** to maintain backwards compatibility with older versions of SQLite,
5103 ** we will continue to restrict the number of entries to usableSize/4 - 8
5104 ** for now. At some point in the future (once everyone has upgraded
5105 ** to 3.6.0 or later) we should consider fixing the conditional above
5106 ** to read "usableSize/4-2" instead of "usableSize/4-8".
5107 */
5108 rc = sqlite3PagerWrite(pTrunk->pDbPage);
5109 if( rc==SQLITE_OK ){
5110 put4byte(&pTrunk->aData[4], nLeaf+1);
5111 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
5112 if( pPage && !pBt->secureDelete ){
5113 sqlite3PagerDontWrite(pPage->pDbPage);
5114 }
5115 rc = btreeSetHasContent(pBt, iPage);
5116 }
5117 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
5118 goto freepage_out;
5119 }
5120 }
5121
5122 /* If control flows to this point, then it was not possible to add the
5123 ** the page being freed as a leaf page of the first trunk in the free-list.
5124 ** Possibly because the free-list is empty, or possibly because the
5125 ** first trunk in the free-list is full. Either way, the page being freed
5126 ** will become the new first trunk page in the free-list.
5127 */
5128 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
5129 goto freepage_out;
5130 }
5131 rc = sqlite3PagerWrite(pPage->pDbPage);
5132 if( rc!=SQLITE_OK ){
5133 goto freepage_out;
5134 }
5135 put4byte(pPage->aData, iTrunk);
5136 put4byte(&pPage->aData[4], 0);
5137 put4byte(&pPage1->aData[32], iPage);
5138 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
5139
5140 freepage_out:
5141 if( pPage ){
5142 pPage->isInit = 0;
5143 }
5144 releasePage(pPage);
5145 releasePage(pTrunk);
5146 return rc;
5147 }
/*
** Add pPage to the free-list via freePage2(), unless *pRC already holds
** an error code, in which case this routine does nothing.  The result of
** freePage2() is stored in *pRC.
*/
static void freePage(MemPage *pPage, int *pRC){
  if( (*pRC)!=SQLITE_OK ) return;
  *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
}
5153
5154 /*
5155 ** Free any overflow pages associated with the given Cell.
5156 */
clearCell(MemPage * pPage,unsigned char * pCell)5157 static int clearCell(MemPage *pPage, unsigned char *pCell){
5158 BtShared *pBt = pPage->pBt;
5159 CellInfo info;
5160 Pgno ovflPgno;
5161 int rc;
5162 int nOvfl;
5163 u32 ovflPageSize;
5164
5165 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5166 btreeParseCellPtr(pPage, pCell, &info);
5167 if( info.iOverflow==0 ){
5168 return SQLITE_OK; /* No overflow pages. Return without doing anything */
5169 }
5170 ovflPgno = get4byte(&pCell[info.iOverflow]);
5171 assert( pBt->usableSize > 4 );
5172 ovflPageSize = pBt->usableSize - 4;
5173 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5174 assert( ovflPgno==0 || nOvfl>0 );
5175 while( nOvfl-- ){
5176 Pgno iNext = 0;
5177 MemPage *pOvfl = 0;
5178 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
5179 /* 0 is not a legal page number and page 1 cannot be an
5180 ** overflow page. Therefore if ovflPgno<2 or past the end of the
5181 ** file the database must be corrupt. */
5182 return SQLITE_CORRUPT_BKPT;
5183 }
5184 if( nOvfl ){
5185 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5186 if( rc ) return rc;
5187 }
5188
5189 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
5190 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
5191 ){
5192 /* There is no reason any cursor should have an outstanding reference
5193 ** to an overflow page belonging to a cell that is being deleted/updated.
5194 ** So if there exists more than one reference to this page, then it
5195 ** must not really be an overflow page and the database must be corrupt.
5196 ** It is helpful to detect this before calling freePage2(), as
5197 ** freePage2() may zero the page contents if secure-delete mode is
5198 ** enabled. If this 'overflow' page happens to be a page that the
5199 ** caller is iterating through or using in some other way, this
5200 ** can be problematic.
5201 */
5202 rc = SQLITE_CORRUPT_BKPT;
5203 }else{
5204 rc = freePage2(pBt, pOvfl, ovflPgno);
5205 }
5206
5207 if( pOvfl ){
5208 sqlite3PagerUnref(pOvfl->pDbPage);
5209 }
5210 if( rc ) return rc;
5211 ovflPgno = iNext;
5212 }
5213 return SQLITE_OK;
5214 }
5215
5216 /*
5217 ** Create the byte sequence used to represent a cell on page pPage
5218 ** and write that byte sequence into pCell[]. Overflow pages are
5219 ** allocated and filled in as necessary. The calling procedure
5220 ** is responsible for making sure sufficient space has been allocated
5221 ** for pCell[].
5222 **
5223 ** Note that pCell does not necessary need to point to the pPage->aData
5224 ** area. pCell might point to some temporary storage. The cell will
5225 ** be constructed in this temporary area then copied into pPage->aData
5226 ** later.
5227 */
fillInCell(MemPage * pPage,unsigned char * pCell,const void * pKey,i64 nKey,const void * pData,int nData,int nZero,int * pnSize)5228 static int fillInCell(
5229 MemPage *pPage, /* The page that contains the cell */
5230 unsigned char *pCell, /* Complete text of the cell */
5231 const void *pKey, i64 nKey, /* The key */
5232 const void *pData,int nData, /* The data */
5233 int nZero, /* Extra zero bytes to append to pData */
5234 int *pnSize /* Write cell size here */
5235 ){
5236 int nPayload;
5237 const u8 *pSrc;
5238 int nSrc, n, rc;
5239 int spaceLeft;
5240 MemPage *pOvfl = 0;
5241 MemPage *pToRelease = 0;
5242 unsigned char *pPrior;
5243 unsigned char *pPayload;
5244 BtShared *pBt = pPage->pBt;
5245 Pgno pgnoOvfl = 0;
5246 int nHeader;
5247 CellInfo info;
5248
5249 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5250
5251 /* pPage is not necessarily writeable since pCell might be auxiliary
5252 ** buffer space that is separate from the pPage buffer area */
5253 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
5254 || sqlite3PagerIswriteable(pPage->pDbPage) );
5255
5256 /* Fill in the header. */
5257 nHeader = 0;
5258 if( !pPage->leaf ){
5259 nHeader += 4;
5260 }
5261 if( pPage->hasData ){
5262 nHeader += putVarint(&pCell[nHeader], nData+nZero);
5263 }else{
5264 nData = nZero = 0;
5265 }
5266 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
5267 btreeParseCellPtr(pPage, pCell, &info);
5268 assert( info.nHeader==nHeader );
5269 assert( info.nKey==nKey );
5270 assert( info.nData==(u32)(nData+nZero) );
5271
5272 /* Fill in the payload */
5273 nPayload = nData + nZero;
5274 if( pPage->intKey ){
5275 pSrc = pData;
5276 nSrc = nData;
5277 nData = 0;
5278 }else{
5279 if( NEVER(nKey>0x7fffffff || pKey==0) ){
5280 return SQLITE_CORRUPT_BKPT;
5281 }
5282 nPayload += (int)nKey;
5283 pSrc = pKey;
5284 nSrc = (int)nKey;
5285 }
5286 *pnSize = info.nSize;
5287 spaceLeft = info.nLocal;
5288 pPayload = &pCell[nHeader];
5289 pPrior = &pCell[info.iOverflow];
5290
5291 while( nPayload>0 ){
5292 if( spaceLeft==0 ){
5293 #ifndef SQLITE_OMIT_AUTOVACUUM
5294 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
5295 if( pBt->autoVacuum ){
5296 do{
5297 pgnoOvfl++;
5298 } while(
5299 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
5300 );
5301 }
5302 #endif
5303 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
5304 #ifndef SQLITE_OMIT_AUTOVACUUM
5305 /* If the database supports auto-vacuum, and the second or subsequent
5306 ** overflow page is being allocated, add an entry to the pointer-map
5307 ** for that page now.
5308 **
5309 ** If this is the first overflow page, then write a partial entry
5310 ** to the pointer-map. If we write nothing to this pointer-map slot,
5311 ** then the optimistic overflow chain processing in clearCell()
5312 ** may misinterpret the uninitialised values and delete the
5313 ** wrong pages from the database.
5314 */
5315 if( pBt->autoVacuum && rc==SQLITE_OK ){
5316 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
5317 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
5318 if( rc ){
5319 releasePage(pOvfl);
5320 }
5321 }
5322 #endif
5323 if( rc ){
5324 releasePage(pToRelease);
5325 return rc;
5326 }
5327
5328 /* If pToRelease is not zero than pPrior points into the data area
5329 ** of pToRelease. Make sure pToRelease is still writeable. */
5330 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5331
5332 /* If pPrior is part of the data area of pPage, then make sure pPage
5333 ** is still writeable */
5334 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
5335 || sqlite3PagerIswriteable(pPage->pDbPage) );
5336
5337 put4byte(pPrior, pgnoOvfl);
5338 releasePage(pToRelease);
5339 pToRelease = pOvfl;
5340 pPrior = pOvfl->aData;
5341 put4byte(pPrior, 0);
5342 pPayload = &pOvfl->aData[4];
5343 spaceLeft = pBt->usableSize - 4;
5344 }
5345 n = nPayload;
5346 if( n>spaceLeft ) n = spaceLeft;
5347
    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
5350 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5351
5352 /* If pPayload is part of the data area of pPage, then make sure pPage
5353 ** is still writeable */
5354 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
5355 || sqlite3PagerIswriteable(pPage->pDbPage) );
5356
5357 if( nSrc>0 ){
5358 if( n>nSrc ) n = nSrc;
5359 assert( pSrc );
5360 memcpy(pPayload, pSrc, n);
5361 }else{
5362 memset(pPayload, 0, n);
5363 }
5364 nPayload -= n;
5365 pPayload += n;
5366 pSrc += n;
5367 nSrc -= n;
5368 spaceLeft -= n;
5369 if( nSrc==0 ){
5370 nSrc = nData;
5371 pSrc = pData;
5372 }
5373 }
5374 releasePage(pToRelease);
5375 return SQLITE_OK;
5376 }
5377
5378 /*
5379 ** Remove the i-th cell from pPage. This routine effects pPage only.
5380 ** The cell content is not freed or deallocated. It is assumed that
5381 ** the cell content has been copied someplace else. This routine just
5382 ** removes the reference to the cell from pPage.
5383 **
5384 ** "sz" must be the number of bytes in the cell.
5385 */
dropCell(MemPage * pPage,int idx,int sz,int * pRC)5386 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
5387 int i; /* Loop counter */
5388 u32 pc; /* Offset to cell content of cell being deleted */
5389 u8 *data; /* pPage->aData */
5390 u8 *ptr; /* Used to move bytes around within data[] */
5391 int rc; /* The return code */
5392 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */
5393
5394 if( *pRC ) return;
5395
5396 assert( idx>=0 && idx<pPage->nCell );
5397 assert( sz==cellSize(pPage, idx) );
5398 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5399 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5400 data = pPage->aData;
5401 ptr = &data[pPage->cellOffset + 2*idx];
5402 pc = get2byte(ptr);
5403 hdr = pPage->hdrOffset;
5404 testcase( pc==get2byte(&data[hdr+5]) );
5405 testcase( pc+sz==pPage->pBt->usableSize );
5406 if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
5407 *pRC = SQLITE_CORRUPT_BKPT;
5408 return;
5409 }
5410 rc = freeSpace(pPage, pc, sz);
5411 if( rc ){
5412 *pRC = rc;
5413 return;
5414 }
5415 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
5416 ptr[0] = ptr[2];
5417 ptr[1] = ptr[3];
5418 }
5419 pPage->nCell--;
5420 put2byte(&data[hdr+3], pPage->nCell);
5421 pPage->nFree += 2;
5422 }
5423
5424 /*
5425 ** Insert a new cell on pPage at cell index "i". pCell points to the
5426 ** content of the cell.
5427 **
5428 ** If the cell content will fit on the page, then put it there. If it
5429 ** will not fit, then make a copy of the cell content into pTemp if
5430 ** pTemp is not null. Regardless of pTemp, allocate a new entry
5431 ** in pPage->aOvfl[] and make it point to the cell content (either
5432 ** in pTemp or the original pCell) and also record its index.
5433 ** Allocating a new entry in pPage->aCell[] implies that
5434 ** pPage->nOverflow is incremented.
5435 **
5436 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
5437 ** cell. The caller will overwrite them after this function returns. If
5438 ** nSkip is non-zero, then pCell may not point to an invalid memory location
5439 ** (but pCell+nSkip is always valid).
5440 */
insertCell(MemPage * pPage,int i,u8 * pCell,int sz,u8 * pTemp,Pgno iChild,int * pRC)5441 static void insertCell(
5442 MemPage *pPage, /* Page into which we are copying */
5443 int i, /* New cell becomes the i-th cell of the page */
5444 u8 *pCell, /* Content of the new cell */
5445 int sz, /* Bytes of content in pCell */
5446 u8 *pTemp, /* Temp storage space for pCell, if needed */
5447 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */
5448 int *pRC /* Read and write return code from here */
5449 ){
5450 int idx = 0; /* Where to write new cell content in data[] */
5451 int j; /* Loop counter */
5452 int end; /* First byte past the last cell pointer in data[] */
5453 int ins; /* Index in data[] where new cell pointer is inserted */
5454 int cellOffset; /* Address of first cell pointer in data[] */
5455 u8 *data; /* The content of the whole page */
5456 u8 *ptr; /* Used for moving information around in data[] */
5457
5458 int nSkip = (iChild ? 4 : 0);
5459
5460 if( *pRC ) return;
5461
5462 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
5463 assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
5464 assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
5465 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5466 /* The cell should normally be sized correctly. However, when moving a
5467 ** malformed cell from a leaf page to an interior page, if the cell size
5468 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
5469 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence
5470 ** the term after the || in the following assert(). */
5471 assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
5472 if( pPage->nOverflow || sz+2>pPage->nFree ){
5473 if( pTemp ){
5474 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
5475 pCell = pTemp;
5476 }
5477 if( iChild ){
5478 put4byte(pCell, iChild);
5479 }
5480 j = pPage->nOverflow++;
5481 assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
5482 pPage->aOvfl[j].pCell = pCell;
5483 pPage->aOvfl[j].idx = (u16)i;
5484 }else{
5485 int rc = sqlite3PagerWrite(pPage->pDbPage);
5486 if( rc!=SQLITE_OK ){
5487 *pRC = rc;
5488 return;
5489 }
5490 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5491 data = pPage->aData;
5492 cellOffset = pPage->cellOffset;
5493 end = cellOffset + 2*pPage->nCell;
5494 ins = cellOffset + 2*i;
5495 rc = allocateSpace(pPage, sz, &idx);
5496 if( rc ){ *pRC = rc; return; }
5497 /* The allocateSpace() routine guarantees the following two properties
5498 ** if it returns success */
5499 assert( idx >= end+2 );
5500 assert( idx+sz <= (int)pPage->pBt->usableSize );
5501 pPage->nCell++;
5502 pPage->nFree -= (u16)(2 + sz);
5503 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5504 if( iChild ){
5505 put4byte(&data[idx], iChild);
5506 }
5507 for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){
5508 ptr[0] = ptr[-2];
5509 ptr[1] = ptr[-1];
5510 }
5511 put2byte(&data[ins], idx);
5512 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
5513 #ifndef SQLITE_OMIT_AUTOVACUUM
5514 if( pPage->pBt->autoVacuum ){
5515 /* The cell may contain a pointer to an overflow page. If so, write
5516 ** the entry for the overflow page into the pointer map.
5517 */
5518 ptrmapPutOvflPtr(pPage, pCell, pRC);
5519 }
5520 #endif
5521 }
5522 }
5523
5524 /*
5525 ** Add a list of cells to a page. The page should be initially empty.
5526 ** The cells are guaranteed to fit on the page.
5527 */
assemblePage(MemPage * pPage,int nCell,u8 ** apCell,u16 * aSize)5528 static void assemblePage(
5529 MemPage *pPage, /* The page to be assemblied */
5530 int nCell, /* The number of cells to add to this page */
5531 u8 **apCell, /* Pointers to cell bodies */
5532 u16 *aSize /* Sizes of the cells */
5533 ){
5534 int i; /* Loop counter */
5535 u8 *pCellptr; /* Address of next cell pointer */
5536 int cellbody; /* Address of next cell body */
5537 u8 * const data = pPage->aData; /* Pointer to data for pPage */
5538 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */
5539 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5540
5541 assert( pPage->nOverflow==0 );
5542 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5543 assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
5544 && (int)MX_CELL(pPage->pBt)<=10921);
5545 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5546
5547 /* Check that the page has just been zeroed by zeroPage() */
5548 assert( pPage->nCell==0 );
5549 assert( get2byteNotZero(&data[hdr+5])==nUsable );
5550
5551 pCellptr = &data[pPage->cellOffset + nCell*2];
5552 cellbody = nUsable;
5553 for(i=nCell-1; i>=0; i--){
5554 pCellptr -= 2;
5555 cellbody -= aSize[i];
5556 put2byte(pCellptr, cellbody);
5557 memcpy(&data[cellbody], apCell[i], aSize[i]);
5558 }
5559 put2byte(&data[hdr+3], nCell);
5560 put2byte(&data[hdr+5], cellbody);
5561 pPage->nFree -= (nCell*2 + nUsable - cellbody);
5562 pPage->nCell = (u16)nCell;
5563 }
5564
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.  NB is derived from NN as NN*2+1.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The value of NN configured below (1) appears to give the best results
** overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */
5579
5580
5581 #ifndef SQLITE_OMIT_QUICKBALANCE
5582 /*
5583 ** This version of balance() handles the common special case where
5584 ** a new entry is being inserted on the extreme right-end of the
5585 ** tree, in other words, when the new entry will become the largest
5586 ** entry in the tree.
5587 **
5588 ** Instead of trying to balance the 3 right-most leaf pages, just add
5589 ** a new page to the right-hand side and put the one new entry in
5590 ** that page. This leaves the right side of the tree somewhat
5591 ** unbalanced. But odds are that we will be inserting new entries
5592 ** at the end soon afterwards so the nearly empty page will quickly
5593 ** fill up. On average.
5594 **
5595 ** pPage is the leaf page which is the right-most page in the tree.
5596 ** pParent is its parent. pPage must have a single overflow entry
5597 ** which is also the right-most entry on the page.
5598 **
5599 ** The pSpace buffer is used to store a temporary copy of the divider
5600 ** cell that will be inserted into pParent. Such a cell consists of a 4
5601 ** byte page number followed by a variable length integer. In other
5602 ** words, at most 13 bytes. Hence the pSpace buffer must be at
5603 ** least 13 bytes in size.
5604 */
balance_quick(MemPage * pParent,MemPage * pPage,u8 * pSpace)5605 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5606 BtShared *const pBt = pPage->pBt; /* B-Tree Database */
5607 MemPage *pNew; /* Newly allocated page */
5608 int rc; /* Return Code */
5609 Pgno pgnoNew; /* Page number of pNew */
5610
5611 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5612 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5613 assert( pPage->nOverflow==1 );
5614
5615 /* This error condition is now caught prior to reaching this function */
5616 if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
5617
5618 /* Allocate a new page. This page will become the right-sibling of
5619 ** pPage. Make the parent page writable, so that the new divider cell
5620 ** may be inserted. If both these operations are successful, proceed.
5621 */
5622 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5623
5624 if( rc==SQLITE_OK ){
5625
5626 u8 *pOut = &pSpace[4];
5627 u8 *pCell = pPage->aOvfl[0].pCell;
5628 u16 szCell = cellSizePtr(pPage, pCell);
5629 u8 *pStop;
5630
5631 assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5632 assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5633 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
5634 assemblePage(pNew, 1, &pCell, &szCell);
5635
5636 /* If this is an auto-vacuum database, update the pointer map
5637 ** with entries for the new page, and any pointer from the
5638 ** cell on the page to an overflow page. If either of these
5639 ** operations fails, the return code is set, but the contents
5640 ** of the parent page are still manipulated by thh code below.
5641 ** That is Ok, at this point the parent page is guaranteed to
5642 ** be marked as dirty. Returning an error code will cause a
5643 ** rollback, undoing any changes made to the parent page.
5644 */
5645 if( ISAUTOVACUUM ){
5646 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5647 if( szCell>pNew->minLocal ){
5648 ptrmapPutOvflPtr(pNew, pCell, &rc);
5649 }
5650 }
5651
5652 /* Create a divider cell to insert into pParent. The divider cell
5653 ** consists of a 4-byte page number (the page number of pPage) and
5654 ** a variable length key value (which must be the same value as the
5655 ** largest key on pPage).
5656 **
5657 ** To find the largest key value on pPage, first find the right-most
5658 ** cell on pPage. The first two fields of this cell are the
5659 ** record-length (a variable length integer at most 32-bits in size)
5660 ** and the key value (a variable length integer, may have any value).
5661 ** The first of the while(...) loops below skips over the record-length
5662 ** field. The second while(...) loop copies the key value from the
5663 ** cell on pPage into the pSpace buffer.
5664 */
5665 pCell = findCell(pPage, pPage->nCell-1);
5666 pStop = &pCell[9];
5667 while( (*(pCell++)&0x80) && pCell<pStop );
5668 pStop = &pCell[9];
5669 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5670
5671 /* Insert the new divider cell into pParent. */
5672 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5673 0, pPage->pgno, &rc);
5674
5675 /* Set the right-child pointer of pParent to point to the new page. */
5676 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5677
5678 /* Release the reference to the new page. */
5679 releasePage(pNew);
5680 }
5681
5682 return rc;
5683 }
5684 #endif /* SQLITE_OMIT_QUICKBALANCE */
5685
#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** It is compiled out by the enclosing "#if 0" and is only activated
** temporarily while debugging code responsible for setting pointer-map
** entries.
**
** For each of the nPage pages in apPage[], look up the pointer-map entry
** of every page referenced from that page and assert that the entry
** names the referencing page as parent with the expected type:
** PTRMAP_OVERFLOW1 for the first overflow page of a cell, PTRMAP_BTREE
** for each child page of a non-leaf page (including the right-child).
**
** Always returns 1.  The return value of ptrmapGet() is not checked.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPage;                 /* Index into apPage[] */
  int iCell;                 /* Index of cell on the current page */
  for(iPage=0; iPage<nPage; iPage++){
    MemPage *pPage = apPage[iPage];
    BtShared *pBt = pPage->pBt;
    Pgno pgnoParent;         /* Parent page read from the pointer map */
    u8 eType;                /* Entry type read from the pointer map */
    assert( pPage->isInit );

    for(iCell=0; iCell<pPage->nCell; iCell++){
      CellInfo info;
      u8 *pCell = findCell(pPage, iCell);

      btreeParseCellPtr(pPage, pCell, &info);
      if( info.iOverflow ){
        /* The cell has an overflow chain; check its first page. */
        ptrmapGet(pBt, get4byte(&pCell[info.iOverflow]), &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        /* The first 4 bytes of an interior cell hold a child page. */
        ptrmapGet(pBt, get4byte(pCell), &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
      }
    }
    if( !pPage->leaf ){
      /* Finally check the right-child pointer in the page header. */
      ptrmapGet(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
                &eType, &pgnoParent);
      assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
5727
5728 /*
5729 ** This function is used to copy the contents of the b-tree node stored
5730 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
5731 ** the pointer-map entries for each child page are updated so that the
5732 ** parent page stored in the pointer map is page pTo. If pFrom contained
5733 ** any cells with overflow page pointers, then the corresponding pointer
5734 ** map entries are also updated so that the parent page is page pTo.
5735 **
5736 ** If pFrom is currently carrying any overflow cells (entries in the
5737 ** MemPage.aOvfl[] array), they are not copied to pTo.
5738 **
5739 ** Before returning, page pTo is reinitialized using btreeInitPage().
5740 **
5741 ** The performance of this function is not critical. It is only used by
5742 ** the balance_shallower() and balance_deeper() procedures, neither of
5743 ** which are called often under normal circumstances.
5744 */
copyNodeContent(MemPage * pFrom,MemPage * pTo,int * pRC)5745 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
5746 if( (*pRC)==SQLITE_OK ){
5747 BtShared * const pBt = pFrom->pBt;
5748 u8 * const aFrom = pFrom->aData;
5749 u8 * const aTo = pTo->aData;
5750 int const iFromHdr = pFrom->hdrOffset;
5751 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
5752 int rc;
5753 int iData;
5754
5755
5756 assert( pFrom->isInit );
5757 assert( pFrom->nFree>=iToHdr );
5758 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
5759
5760 /* Copy the b-tree node content from page pFrom to page pTo. */
5761 iData = get2byte(&aFrom[iFromHdr+5]);
5762 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
5763 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
5764
5765 /* Reinitialize page pTo so that the contents of the MemPage structure
5766 ** match the new data. The initialization of pTo can actually fail under
5767 ** fairly obscure circumstances, even though it is a copy of initialized
5768 ** page pFrom.
5769 */
5770 pTo->isInit = 0;
5771 rc = btreeInitPage(pTo);
5772 if( rc!=SQLITE_OK ){
5773 *pRC = rc;
5774 return;
5775 }
5776
5777 /* If this is an auto-vacuum database, update the pointer-map entries
5778 ** for any b-tree or overflow pages that pTo now contains the pointers to.
5779 */
5780 if( ISAUTOVACUUM ){
5781 *pRC = setChildPtrmaps(pTo);
5782 }
5783 }
5784 }
5785
5786 /*
5787 ** This routine redistributes cells on the iParentIdx'th child of pParent
5788 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
5791 ** side if the page is the first or last child of its parent. If the page
5792 ** has fewer than 2 siblings (something which can only happen if the page
5793 ** is a root page or a child of a root page) then all available siblings
5794 ** participate in the balancing.
5795 **
5796 ** The number of siblings of the page might be increased or decreased by
5797 ** one or two in an effort to keep pages nearly full but not over full.
5798 **
5799 ** Note that when this routine is called, some of the cells on the page
5800 ** might not actually be stored in MemPage.aData[]. This can happen
5801 ** if the page is overfull. This routine ensures that all cells allocated
5802 ** to the page and its siblings fit into MemPage.aData[] before returning.
5803 **
5804 ** In the course of balancing the page and its siblings, cells may be
5805 ** inserted into or removed from the parent page (pParent). Doing so
5806 ** may cause the parent page to become overfull or underfull. If this
5807 ** happens, it is the responsibility of the caller to invoke the correct
5808 ** balancing routine to fix this problem (see the balance() routine).
5809 **
5810 ** If this routine fails for any reason, it might leave the database
5811 ** in a corrupted state. So if this routine fails, the database should
5812 ** be rolled back.
5813 **
5814 ** The third argument to this function, aOvflSpace, is a pointer to a
5815 ** buffer big enough to hold one page. If while inserting cells into the parent
5816 ** page (pParent) the parent page becomes overfull, this buffer is
5817 ** used to store the parent's overflow cells. Because this function inserts
5818 ** a maximum of four divider cells into the parent page, and the maximum
5819 ** size of a cell stored within an internal node is always less than 1/4
5820 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
5821 ** enough for all overflow cells.
5822 **
5823 ** If aOvflSpace is set to a null pointer, this function returns
5824 ** SQLITE_NOMEM.
5825 */
balance_nonroot(MemPage * pParent,int iParentIdx,u8 * aOvflSpace,int isRoot)5826 static int balance_nonroot(
5827 MemPage *pParent, /* Parent page of siblings being balanced */
5828 int iParentIdx, /* Index of "the page" in pParent */
5829 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
5830 int isRoot /* True if pParent is a root-page */
5831 ){
5832 BtShared *pBt; /* The whole database */
5833 int nCell = 0; /* Number of cells in apCell[] */
5834 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
5835 int nNew = 0; /* Number of pages in apNew[] */
5836 int nOld; /* Number of pages in apOld[] */
5837 int i, j, k; /* Loop counters */
5838 int nxDiv; /* Next divider slot in pParent->aCell[] */
5839 int rc = SQLITE_OK; /* The return code */
5840 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */
5841 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
5842 int usableSpace; /* Bytes in pPage beyond the header */
5843 int pageFlags; /* Value of pPage->aData[0] */
5844 int subtotal; /* Subtotal of bytes in cells on one page */
5845 int iSpace1 = 0; /* First unused byte of aSpace1[] */
5846 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
5847 int szScratch; /* Size of scratch memory requested */
5848 MemPage *apOld[NB]; /* pPage and up to two siblings */
5849 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
5850 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
5851 u8 *pRight; /* Location in parent of right-sibling pointer */
5852 u8 *apDiv[NB-1]; /* Divider cells in pParent */
5853 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
5854 int szNew[NB+2]; /* Combined size of cells place on i-th page */
5855 u8 **apCell = 0; /* All cells begin balanced */
5856 u16 *szCell; /* Local size of all cells in apCell[] */
5857 u8 *aSpace1; /* Space for copies of dividers cells */
5858 Pgno pgno; /* Temp var to store a page number in */
5859
5860 pBt = pParent->pBt;
5861 assert( sqlite3_mutex_held(pBt->mutex) );
5862 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5863
5864 #if 0
5865 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
5866 #endif
5867
5868 /* At this point pParent may have at most one overflow cell. And if
5869 ** this overflow cell is present, it must be the cell with
5870 ** index iParentIdx. This scenario comes about when this function
5871 ** is called (indirectly) from sqlite3BtreeDelete().
5872 */
5873 assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
5874 assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx );
5875
5876 if( !aOvflSpace ){
5877 return SQLITE_NOMEM;
5878 }
5879
5880 /* Find the sibling pages to balance. Also locate the cells in pParent
5881 ** that divide the siblings. An attempt is made to find NN siblings on
5882 ** either side of pPage. More siblings are taken from one side, however,
5883 ** if there are fewer than NN siblings on the other side. If pParent
5884 ** has NB or fewer children then all children of pParent are taken.
5885 **
5886 ** This loop also drops the divider cells from the parent page. This
5887 ** way, the remainder of the function does not have to deal with any
5888 ** overflow cells in the parent page, since if any existed they will
5889 ** have already been removed.
5890 */
5891 i = pParent->nOverflow + pParent->nCell;
5892 if( i<2 ){
5893 nxDiv = 0;
5894 nOld = i+1;
5895 }else{
5896 nOld = 3;
5897 if( iParentIdx==0 ){
5898 nxDiv = 0;
5899 }else if( iParentIdx==i ){
5900 nxDiv = i-2;
5901 }else{
5902 nxDiv = iParentIdx-1;
5903 }
5904 i = 2;
5905 }
5906 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
5907 pRight = &pParent->aData[pParent->hdrOffset+8];
5908 }else{
5909 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
5910 }
5911 pgno = get4byte(pRight);
5912 while( 1 ){
5913 rc = getAndInitPage(pBt, pgno, &apOld[i]);
5914 if( rc ){
5915 memset(apOld, 0, (i+1)*sizeof(MemPage*));
5916 goto balance_cleanup;
5917 }
5918 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
5919 if( (i--)==0 ) break;
5920
5921 if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){
5922 apDiv[i] = pParent->aOvfl[0].pCell;
5923 pgno = get4byte(apDiv[i]);
5924 szNew[i] = cellSizePtr(pParent, apDiv[i]);
5925 pParent->nOverflow = 0;
5926 }else{
5927 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
5928 pgno = get4byte(apDiv[i]);
5929 szNew[i] = cellSizePtr(pParent, apDiv[i]);
5930
5931 /* Drop the cell from the parent page. apDiv[i] still points to
5932 ** the cell within the parent, even though it has been dropped.
5933 ** This is safe because dropping a cell only overwrites the first
5934 ** four bytes of it, and this function does not need the first
5935 ** four bytes of the divider cell. So the pointer is safe to use
5936 ** later on.
5937 **
5938 ** Unless SQLite is compiled in secure-delete mode. In this case,
5939 ** the dropCell() routine will overwrite the entire cell with zeroes.
5940 ** In this case, temporarily copy the cell into the aOvflSpace[]
5941 ** buffer. It will be copied out again as soon as the aSpace[] buffer
5942 ** is allocated. */
5943 if( pBt->secureDelete ){
5944 int iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
5945 if( (iOff+szNew[i])>(int)pBt->usableSize ){
5946 rc = SQLITE_CORRUPT_BKPT;
5947 memset(apOld, 0, (i+1)*sizeof(MemPage*));
5948 goto balance_cleanup;
5949 }else{
5950 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
5951 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
5952 }
5953 }
5954 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
5955 }
5956 }
5957
5958 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
5959 ** alignment */
5960 nMaxCells = (nMaxCells + 3)&~3;
5961
5962 /*
5963 ** Allocate space for memory structures
5964 */
5965 k = pBt->pageSize + ROUND8(sizeof(MemPage));
5966 szScratch =
5967 nMaxCells*sizeof(u8*) /* apCell */
5968 + nMaxCells*sizeof(u16) /* szCell */
5969 + pBt->pageSize /* aSpace1 */
5970 + k*nOld; /* Page copies (apCopy) */
5971 apCell = sqlite3ScratchMalloc( szScratch );
5972 if( apCell==0 ){
5973 rc = SQLITE_NOMEM;
5974 goto balance_cleanup;
5975 }
5976 szCell = (u16*)&apCell[nMaxCells];
5977 aSpace1 = (u8*)&szCell[nMaxCells];
5978 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
5979
5980 /*
5981 ** Load pointers to all cells on sibling pages and the divider cells
5982 ** into the local apCell[] array. Make copies of the divider cells
  ** into space obtained from aSpace1[] and remove the divider cells
  ** from pParent.
5985 **
5986 ** If the siblings are on leaf pages, then the child pointers of the
5987 ** divider cells are stripped from the cells before they are copied
5988 ** into aSpace1[]. In this way, all cells in apCell[] are without
5989 ** child pointers. If siblings are not leaves, then all cell in
5990 ** apCell[] include child pointers. Either way, all cells in apCell[]
5991 ** are alike.
5992 **
5993 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5994 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
5995 */
5996 leafCorrection = apOld[0]->leaf*4;
5997 leafData = apOld[0]->hasData;
5998 for(i=0; i<nOld; i++){
5999 int limit;
6000
    /* Before doing anything else, take a copy of the i'th original sibling.
    ** The rest of this function will use data from the copies rather
    ** than the original pages since the original pages will be in the
    ** process of being overwritten.  */
6005 MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
6006 memcpy(pOld, apOld[i], sizeof(MemPage));
6007 pOld->aData = (void*)&pOld[1];
6008 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
6009
6010 limit = pOld->nCell+pOld->nOverflow;
6011 for(j=0; j<limit; j++){
6012 assert( nCell<nMaxCells );
6013 apCell[nCell] = findOverflowCell(pOld, j);
6014 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6015 nCell++;
6016 }
6017 if( i<nOld-1 && !leafData){
6018 u16 sz = (u16)szNew[i];
6019 u8 *pTemp;
6020 assert( nCell<nMaxCells );
6021 szCell[nCell] = sz;
6022 pTemp = &aSpace1[iSpace1];
6023 iSpace1 += sz;
6024 assert( sz<=pBt->maxLocal+23 );
6025 assert( iSpace1 <= (int)pBt->pageSize );
6026 memcpy(pTemp, apDiv[i], sz);
6027 apCell[nCell] = pTemp+leafCorrection;
6028 assert( leafCorrection==0 || leafCorrection==4 );
6029 szCell[nCell] = szCell[nCell] - leafCorrection;
6030 if( !pOld->leaf ){
6031 assert( leafCorrection==0 );
6032 assert( pOld->hdrOffset==0 );
6033 /* The right pointer of the child page pOld becomes the left
6034 ** pointer of the divider cell */
6035 memcpy(apCell[nCell], &pOld->aData[8], 4);
6036 }else{
6037 assert( leafCorrection==4 );
6038 if( szCell[nCell]<4 ){
6039 /* Do not allow any cells smaller than 4 bytes. */
6040 szCell[nCell] = 4;
6041 }
6042 }
6043 nCell++;
6044 }
6045 }
6046
6047 /*
6048 ** Figure out the number of pages needed to hold all nCell cells.
6049 ** Store this number in "k". Also compute szNew[] which is the total
6050 ** size of all cells on the i-th page and cntNew[] which is the index
6051 ** in apCell[] of the cell that divides page i from page i+1.
6052 ** cntNew[k] should equal nCell.
6053 **
6054 ** Values computed by this block:
6055 **
6056 ** k: The total number of sibling pages
  **           szNew[i]: Space used on the i-th sibling page.
6058 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
6059 ** the right of the i-th sibling page.
6060 ** usableSpace: Number of bytes of space available on each sibling.
6061 **
6062 */
6063 usableSpace = pBt->usableSize - 12 + leafCorrection;
6064 for(subtotal=k=i=0; i<nCell; i++){
6065 assert( i<nMaxCells );
6066 subtotal += szCell[i] + 2;
6067 if( subtotal > usableSpace ){
6068 szNew[k] = subtotal - szCell[i];
6069 cntNew[k] = i;
6070 if( leafData ){ i--; }
6071 subtotal = 0;
6072 k++;
6073 if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
6074 }
6075 }
6076 szNew[k] = subtotal;
6077 cntNew[k] = nCell;
6078 k++;
6079
6080 /*
6081 ** The packing computed by the previous block is biased toward the siblings
6082 ** on the left side. The left siblings are always nearly full, while the
6083 ** right-most sibling might be nearly empty. This block of code attempts
6084 ** to adjust the packing of siblings to get a better balance.
6085 **
6086 ** This adjustment is more than an optimization. The packing above might
6087 ** be so out of balance as to be illegal. For example, the right-most
6088 ** sibling might be completely empty. This adjustment is not optional.
6089 */
6090 for(i=k-1; i>0; i--){
6091 int szRight = szNew[i]; /* Size of sibling on the right */
6092 int szLeft = szNew[i-1]; /* Size of sibling on the left */
6093 int r; /* Index of right-most cell in left sibling */
6094 int d; /* Index of first cell to the left of right sibling */
6095
6096 r = cntNew[i-1] - 1;
6097 d = r + 1 - leafData;
6098 assert( d<nMaxCells );
6099 assert( r<nMaxCells );
6100 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
6101 szRight += szCell[d] + 2;
6102 szLeft -= szCell[r] + 2;
6103 cntNew[i-1]--;
6104 r = cntNew[i-1] - 1;
6105 d = r + 1 - leafData;
6106 }
6107 szNew[i] = szRight;
6108 szNew[i-1] = szLeft;
6109 }
6110
  /* Either we found one or more cells (cntNew[0]>0) or pPage is
  ** a virtual root page.  A virtual root page is when the real root
  ** page is page 1 and we are the only child of that page.
  */
6115 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
6116
6117 TRACE(("BALANCE: old: %d %d %d ",
6118 apOld[0]->pgno,
6119 nOld>=2 ? apOld[1]->pgno : 0,
6120 nOld>=3 ? apOld[2]->pgno : 0
6121 ));
6122
6123 /*
6124 ** Allocate k new pages. Reuse old pages where possible.
6125 */
6126 if( apOld[0]->pgno<=1 ){
6127 rc = SQLITE_CORRUPT_BKPT;
6128 goto balance_cleanup;
6129 }
6130 pageFlags = apOld[0]->aData[0];
6131 for(i=0; i<k; i++){
6132 MemPage *pNew;
6133 if( i<nOld ){
6134 pNew = apNew[i] = apOld[i];
6135 apOld[i] = 0;
6136 rc = sqlite3PagerWrite(pNew->pDbPage);
6137 nNew++;
6138 if( rc ) goto balance_cleanup;
6139 }else{
6140 assert( i>0 );
6141 rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
6142 if( rc ) goto balance_cleanup;
6143 apNew[i] = pNew;
6144 nNew++;
6145
6146 /* Set the pointer-map entry for the new sibling page. */
6147 if( ISAUTOVACUUM ){
6148 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
6149 if( rc!=SQLITE_OK ){
6150 goto balance_cleanup;
6151 }
6152 }
6153 }
6154 }
6155
6156 /* Free any old pages that were not reused as new pages.
6157 */
6158 while( i<nOld ){
6159 freePage(apOld[i], &rc);
6160 if( rc ) goto balance_cleanup;
6161 releasePage(apOld[i]);
6162 apOld[i] = 0;
6163 i++;
6164 }
6165
6166 /*
6167 ** Put the new pages in accending order. This helps to
6168 ** keep entries in the disk file in order so that a scan
6169 ** of the table is a linear scan through the file. That
6170 ** in turn helps the operating system to deliver pages
6171 ** from the disk more rapidly.
6172 **
6173 ** An O(n^2) insertion sort algorithm is used, but since
6174 ** n is never more than NB (a small constant), that should
6175 ** not be a problem.
6176 **
6177 ** When NB==3, this one optimization makes the database
6178 ** about 25% faster for large insertions and deletions.
6179 */
6180 for(i=0; i<k-1; i++){
6181 int minV = apNew[i]->pgno;
6182 int minI = i;
6183 for(j=i+1; j<k; j++){
6184 if( apNew[j]->pgno<(unsigned)minV ){
6185 minI = j;
6186 minV = apNew[j]->pgno;
6187 }
6188 }
6189 if( minI>i ){
6190 MemPage *pT;
6191 pT = apNew[i];
6192 apNew[i] = apNew[minI];
6193 apNew[minI] = pT;
6194 }
6195 }
6196 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
6197 apNew[0]->pgno, szNew[0],
6198 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
6199 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
6200 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
6201 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
6202
6203 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6204 put4byte(pRight, apNew[nNew-1]->pgno);
6205
6206 /*
6207 ** Evenly distribute the data in apCell[] across the new pages.
6208 ** Insert divider cells into pParent as necessary.
6209 */
6210 j = 0;
6211 for(i=0; i<nNew; i++){
6212 /* Assemble the new sibling page. */
6213 MemPage *pNew = apNew[i];
6214 assert( j<nMaxCells );
6215 zeroPage(pNew, pageFlags);
6216 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
6217 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
6218 assert( pNew->nOverflow==0 );
6219
6220 j = cntNew[i];
6221
6222 /* If the sibling page assembled above was not the right-most sibling,
6223 ** insert a divider cell into the parent page.
6224 */
6225 assert( i<nNew-1 || j==nCell );
6226 if( j<nCell ){
6227 u8 *pCell;
6228 u8 *pTemp;
6229 int sz;
6230
6231 assert( j<nMaxCells );
6232 pCell = apCell[j];
6233 sz = szCell[j] + leafCorrection;
6234 pTemp = &aOvflSpace[iOvflSpace];
6235 if( !pNew->leaf ){
6236 memcpy(&pNew->aData[8], pCell, 4);
6237 }else if( leafData ){
6238 /* If the tree is a leaf-data tree, and the siblings are leaves,
6239 ** then there is no divider cell in apCell[]. Instead, the divider
6240 ** cell consists of the integer key for the right-most cell of
6241 ** the sibling-page assembled above only.
6242 */
6243 CellInfo info;
6244 j--;
6245 btreeParseCellPtr(pNew, apCell[j], &info);
6246 pCell = pTemp;
6247 sz = 4 + putVarint(&pCell[4], info.nKey);
6248 pTemp = 0;
6249 }else{
6250 pCell -= 4;
6251 /* Obscure case for non-leaf-data trees: If the cell at pCell was
6252 ** previously stored on a leaf node, and its reported size was 4
6253 ** bytes, then it may actually be smaller than this
6254 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
6255 ** any cell). But it is important to pass the correct size to
6256 ** insertCell(), so reparse the cell now.
6257 **
6258 ** Note that this can never happen in an SQLite data file, as all
6259 ** cells are at least 4 bytes. It only happens in b-trees used
6260 ** to evaluate "IN (SELECT ...)" and similar clauses.
6261 */
6262 if( szCell[j]==4 ){
6263 assert(leafCorrection==4);
6264 sz = cellSizePtr(pParent, pCell);
6265 }
6266 }
6267 iOvflSpace += sz;
6268 assert( sz<=pBt->maxLocal+23 );
6269 assert( iOvflSpace <= (int)pBt->pageSize );
6270 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
6271 if( rc!=SQLITE_OK ) goto balance_cleanup;
6272 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6273
6274 j++;
6275 nxDiv++;
6276 }
6277 }
6278 assert( j==nCell );
6279 assert( nOld>0 );
6280 assert( nNew>0 );
6281 if( (pageFlags & PTF_LEAF)==0 ){
6282 u8 *zChild = &apCopy[nOld-1]->aData[8];
6283 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
6284 }
6285
6286 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6287 /* The root page of the b-tree now contains no cells. The only sibling
6288 ** page is the right-child of the parent. Copy the contents of the
6289 ** child page into the parent, decreasing the overall height of the
6290 ** b-tree structure by one. This is described as the "balance-shallower"
6291 ** sub-algorithm in some documentation.
6292 **
6293 ** If this is an auto-vacuum database, the call to copyNodeContent()
6294 ** sets all pointer-map entries corresponding to database image pages
6295 ** for which the pointer is stored within the content being copied.
6296 **
6297 ** The second assert below verifies that the child page is defragmented
6298 ** (it must be, as it was just reconstructed using assemblePage()). This
6299 ** is important if the parent page happens to be page 1 of the database
6300 ** image. */
6301 assert( nNew==1 );
6302 assert( apNew[0]->nFree ==
6303 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6304 );
6305 copyNodeContent(apNew[0], pParent, &rc);
6306 freePage(apNew[0], &rc);
6307 }else if( ISAUTOVACUUM ){
6308 /* Fix the pointer-map entries for all the cells that were shifted around.
6309 ** There are several different types of pointer-map entries that need to
6310 ** be dealt with by this routine. Some of these have been set already, but
6311 ** many have not. The following is a summary:
6312 **
6313 ** 1) The entries associated with new sibling pages that were not
6314 ** siblings when this function was called. These have already
6315 ** been set. We don't need to worry about old siblings that were
6316 ** moved to the free-list - the freePage() code has taken care
6317 ** of those.
6318 **
6319 ** 2) The pointer-map entries associated with the first overflow
6320 ** page in any overflow chains used by new divider cells. These
6321 ** have also already been taken care of by the insertCell() code.
6322 **
6323 ** 3) If the sibling pages are not leaves, then the child pages of
6324 ** cells stored on the sibling pages may need to be updated.
6325 **
6326 ** 4) If the sibling pages are not internal intkey nodes, then any
6327 ** overflow pages used by these cells may need to be updated
6328 ** (internal intkey nodes never contain pointers to overflow pages).
6329 **
6330 ** 5) If the sibling pages are not leaves, then the pointer-map
6331 ** entries for the right-child pages of each sibling may need
6332 ** to be updated.
6333 **
6334 ** Cases 1 and 2 are dealt with above by other code. The next
6335 ** block deals with cases 3 and 4 and the one after that, case 5. Since
6336 ** setting a pointer map entry is a relatively expensive operation, this
6337 ** code only sets pointer map entries for child or overflow pages that have
6338 ** actually moved between pages. */
6339 MemPage *pNew = apNew[0];
6340 MemPage *pOld = apCopy[0];
6341 int nOverflow = pOld->nOverflow;
6342 int iNextOld = pOld->nCell + nOverflow;
6343 int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1);
6344 j = 0; /* Current 'old' sibling page */
6345 k = 0; /* Current 'new' sibling page */
6346 for(i=0; i<nCell; i++){
6347 int isDivider = 0;
6348 while( i==iNextOld ){
6349 /* Cell i is the cell immediately following the last cell on old
6350 ** sibling page j. If the siblings are not leaf pages of an
6351 ** intkey b-tree, then cell i was a divider cell. */
6352 pOld = apCopy[++j];
6353 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6354 if( pOld->nOverflow ){
6355 nOverflow = pOld->nOverflow;
6356 iOverflow = i + !leafData + pOld->aOvfl[0].idx;
6357 }
6358 isDivider = !leafData;
6359 }
6360
6361 assert(nOverflow>0 || iOverflow<i );
6362 assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1);
6363 assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1);
6364 if( i==iOverflow ){
6365 isDivider = 1;
6366 if( (--nOverflow)>0 ){
6367 iOverflow++;
6368 }
6369 }
6370
6371 if( i==cntNew[k] ){
6372 /* Cell i is the cell immediately following the last cell on new
6373 ** sibling page k. If the siblings are not leaf pages of an
6374 ** intkey b-tree, then cell i is a divider cell. */
6375 pNew = apNew[++k];
6376 if( !leafData ) continue;
6377 }
6378 assert( j<nOld );
6379 assert( k<nNew );
6380
6381 /* If the cell was originally divider cell (and is not now) or
6382 ** an overflow cell, or if the cell was located on a different sibling
6383 ** page before the balancing, then the pointer map entries associated
6384 ** with any child or overflow pages need to be updated. */
6385 if( isDivider || pOld->pgno!=pNew->pgno ){
6386 if( !leafCorrection ){
6387 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
6388 }
6389 if( szCell[i]>pNew->minLocal ){
6390 ptrmapPutOvflPtr(pNew, apCell[i], &rc);
6391 }
6392 }
6393 }
6394
6395 if( !leafCorrection ){
6396 for(i=0; i<nNew; i++){
6397 u32 key = get4byte(&apNew[i]->aData[8]);
6398 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
6399 }
6400 }
6401
6402 #if 0
6403 /* The ptrmapCheckPages() contains assert() statements that verify that
6404 ** all pointer map pages are set correctly. This is helpful while
6405 ** debugging. This is usually disabled because a corrupt database may
6406 ** cause an assert() statement to fail. */
6407 ptrmapCheckPages(apNew, nNew);
6408 ptrmapCheckPages(&pParent, 1);
6409 #endif
6410 }
6411
6412 assert( pParent->isInit );
6413 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6414 nOld, nNew, nCell));
6415
6416 /*
6417 ** Cleanup before returning.
6418 */
6419 balance_cleanup:
6420 sqlite3ScratchFree(apCell);
6421 for(i=0; i<nOld; i++){
6422 releasePage(apOld[i]);
6423 }
6424 for(i=0; i<nNew; i++){
6425 releasePage(apNew[i]);
6426 }
6427
6428 return rc;
6429 }
6430
6431
6432 /*
6433 ** This function is called when the root page of a b-tree structure is
6434 ** overfull (has one or more overflow pages).
6435 **
6436 ** A new child page is allocated and the contents of the current root
6437 ** page, including overflow cells, are copied into the child. The root
6438 ** page is then overwritten to make it an empty page with the right-child
6439 ** pointer pointing to the new page.
6440 **
6441 ** Before returning, all pointer-map entries corresponding to pages
6442 ** that the new child-page now contains pointers to are updated. The
6443 ** entry corresponding to the new right-child pointer of the root
6444 ** page is also updated.
6445 **
6446 ** If successful, *ppChild is set to contain a reference to the child
6447 ** page and SQLITE_OK is returned. In this case the caller is required
6448 ** to call releasePage() on *ppChild exactly once. If an error occurs,
6449 ** an error code is returned and *ppChild is set to 0.
6450 */
static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
  BtShared *pBt = pRoot->pBt;    /* The BTree (name used by ISAUTOVACUUM) */
  MemPage *pKid = 0;             /* The freshly allocated child page */
  Pgno iKid = 0;                 /* Page number of pKid */
  int rc;                        /* Result code from subprocedures */

  assert( pRoot->nOverflow>0 );
  assert( sqlite3_mutex_held(pBt->mutex) );

  /* Step 1: make the root writable, allocate the child, and copy the
  ** node stored on the root into it. copyNodeContent() and ptrmapPut()
  ** both take &rc. NOTE(review): they are called even when the preceding
  ** step failed, so presumably each is a no-op when rc already holds an
  ** error - confirm against their definitions.
  */
  rc = sqlite3PagerWrite(pRoot->pDbPage);
  if( rc==SQLITE_OK ){
    rc = allocateBtreePage(pBt, &pKid, &iKid, pRoot->pgno, 0);
    copyNodeContent(pRoot, pKid, &rc);
    if( ISAUTOVACUUM ){
      ptrmapPut(pBt, iKid, PTRMAP_BTREE, pRoot->pgno, &rc);
    }
  }

  if( rc==SQLITE_OK ){
    assert( sqlite3PagerIswriteable(pKid->pDbPage) );
    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
    assert( pKid->nCell==pRoot->nCell );

    TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pKid->pgno));

    /* Step 2: hand the root's overflow cells over to the child. */
    memcpy(pKid->aOvfl, pRoot->aOvfl,
           pRoot->nOverflow*sizeof(pRoot->aOvfl[0]));
    pKid->nOverflow = pRoot->nOverflow;

    /* Step 3: wipe the root, keeping the child's page flags minus the
    ** leaf bit, and make the child its right-child pointer. */
    zeroPage(pRoot, pKid->aData[0] & ~PTF_LEAF);
    put4byte(&pRoot->aData[pRoot->hdrOffset+8], iKid);

    *ppChild = pKid;
  }else{
    /* Failure: report no child and drop whatever reference was taken. */
    *ppChild = 0;
    releasePage(pKid);
  }
  return rc;
}
6494
6495 /*
6496 ** The page that pCur currently points to has just been modified in
6497 ** some way. This function figures out if this modification means the
6498 ** tree needs to be balanced, and if so calls the appropriate balancing
6499 ** routine. Balancing routines are:
6500 **
6501 ** balance_quick()
6502 ** balance_deeper()
6503 ** balance_nonroot()
6504 */
static int balance(BtCursor *pCur){
  u8 aQuickCell[13];             /* Overflow-cell storage for balance_quick() */
  u8 *pPrevSpace = 0;            /* Scratch buffer of the previous nonroot pass */
  int rc = SQLITE_OK;            /* Result code */
  const int nFreeLimit = pCur->pBt->usableSize * 2 / 3;

  TESTONLY( int nQuickCalls = 0 );
  TESTONLY( int nDeeperCalls = 0 );

  /* Work upwards from the cursor's current page. Each pass either finds
  ** that the current page needs no balancing (and stops), or balances it
  ** and then moves on to the page that must be examined next.
  */
  for(;;){
    int iDepth = pCur->iPage;
    MemPage *pPage = pCur->apPage[iDepth];

    if( iDepth==0 ){
      /* At the root. Nothing to do unless it is overfull. */
      if( pPage->nOverflow==0 ) break;

      /* The root page is overfull. balance_deeper() creates a new child
      ** for the root and copies the root's content into it. The cursor
      ** is then pointed at that child so that the next pass balances it.
      */
      assert( (nDeeperCalls++)==0 );
      rc = balance_deeper(pPage, &pCur->apPage[1]);
      if( rc==SQLITE_OK ){
        pCur->iPage = 1;
        pCur->aiIdx[0] = 0;
        pCur->aiIdx[1] = 0;
        assert( pCur->apPage[1]->nOverflow );
      }
    }else if( pPage->nOverflow==0 && pPage->nFree<=nFreeLimit ){
      /* Neither overfull nor holding more than 2/3 free space: done. */
      break;
    }else{
      MemPage * const pUp = pCur->apPage[iDepth-1];  /* Parent of pPage */
      int const iChild = pCur->aiIdx[iDepth-1];      /* pPage's slot in pUp */

      rc = sqlite3PagerWrite(pUp->pDbPage);
      if( rc==SQLITE_OK ){
#ifndef SQLITE_OMIT_QUICKBALANCE
        if( pPage->hasData
         && pPage->nOverflow==1
         && pPage->aOvfl[0].idx==pPage->nCell
         && pUp->pgno!=1
         && pUp->nCell==iChild
        ){
          /* balance_quick() creates a new sibling of pPage to hold the
          ** single overflow cell. It inserts a new cell into the parent,
          ** which may make the parent overflow; if so, the next iteration
          ** of the loop balances the parent using either balance_nonroot()
          ** or balance_deeper(). Until then the overflow cell lives in
          ** aQuickCell[].
          **
          ** The assert() checks that balance_quick() runs at most once
          ** per call to this function, since a second call would reuse
          ** aQuickCell[] while it might still be needed.
          */
          assert( (nQuickCalls++)==0 );
          rc = balance_quick(pUp, pPage, aQuickCell);
        }else
#endif
        {
          /* General case: balance_nonroot() redistributes cells between
          ** pPage and up to 2 of its siblings. That modifies the parent,
          ** which may itself end up overfull or underfull; the next
          ** iteration of the loop corrects that.
          **
          ** Overflow cells left on the parent are stored in pSpace. They
          ** stay there until a later balance_nonroot() call has copied
          ** them onto a database page or into its own scratch buffer.
          ** For that reason the buffer from the PREVIOUS pass is freed
          ** only after this pass's balance_nonroot() call has returned.
          */
          u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
          rc = balance_nonroot(pUp, iChild, pSpace, iDepth==1);
          if( pPrevSpace ){
            sqlite3PageFree(pPrevSpace);
          }

          /* pSpace is released after the next balance_nonroot() call or
          ** just before this function returns, whichever comes first. */
          pPrevSpace = pSpace;
        }
      }

      pPage->nOverflow = 0;

      /* Step up one level: the next pass examines the parent page. */
      releasePage(pPage);
      pCur->iPage--;
    }

    if( rc!=SQLITE_OK ) break;
  }

  if( pPrevSpace ){
    sqlite3PageFree(pPrevSpace);
  }
  return rc;
}
6616
6617
6618 /*
6619 ** Insert a new record into the BTree. The key is given by (pKey,nKey)
6620 ** and the data is given by (pData,nData). The cursor is used only to
6621 ** define what table the record should be inserted into. The cursor
6622 ** is left pointing at a random location.
6623 **
6624 ** For an INTKEY table, only the nKey value of the key is used. pKey is
6625 ** ignored. For a ZERODATA table, the pData and nData are both ignored.
6626 **
6627 ** If the seekResult parameter is non-zero, then a successful call to
6628 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
6629 ** been performed. seekResult is the search result returned (a negative
6630 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
** a positive value if pCur points at an entry that is larger than
6632 ** (pKey, nKey)).
6633 **
** If the seekResult parameter is non-zero, then the caller guarantees that
** cursor pCur is already positioned next to the place where the new entry
** belongs, so no seek is performed and no existing entry is overwritten.
** If the seekResult parameter is 0, then cursor pCur may point to any
** entry or to no entry at all and so this function has to seek the cursor
** before the new key can be inserted. An existing entry is overwritten
** only when that seek finds an exact match.
6639 */
int sqlite3BtreeInsert(
  BtCursor *pCur,                /* Insert data into the table of this cursor */
  const void *pKey, i64 nKey,    /* The key of the new record */
  const void *pData, int nData,  /* The data of the new record */
  int nZero,                     /* Number of extra 0 bytes to append to data */
  int appendBias,                /* True if this is likely an append */
  int seekResult                 /* Result of prior MovetoUnpacked() call */
){
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  MemPage *pPage;                /* Page the new cell is inserted into */
  unsigned char *pNewCell = 0;   /* The assembled cell (in pBt->pTmpSpace) */
  int nNewCell = 0;              /* Size of pNewCell in bytes */
  int iCell;                     /* Cell index at which to insert */
  int cmp = seekResult;          /* 0: overwrite. <0: cursor is before the
                                 ** insert point. >0: cursor is after it. */
  int rc;                        /* Result code */

  /* A cursor in the fault state just reports its saved error. */
  if( pCur->eState==CURSOR_FAULT ){
    assert( pCur->skipNext!=SQLITE_OK );
    return pCur->skipNext;
  }

  assert( cursorHoldsMutex(pCur) );
  assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

  /* The caller must be consistent: a blob key (pKey!=0) goes with a cursor
  ** opened with a KeyInfo (index b-tree), and an integer-only key goes
  ** with a cursor opened without one (intkey table). */
  assert( (pKey==0)==(pCur->pKeyInfo==0) );

  /* For a table b-tree, invalidate incrblob cursors open on row nKey.
  ** If this insert does not replace an existing row this is a no-op. */
  if( pCur->pKeyInfo==0 ){
    invalidateIncrblobCursors(p, nKey, 0);
  }

  /* Save the positions of all other cursors open on this table, then
  ** seek pCur unless the caller already did so (seekResult!=0).
  **
  ** pCur itself is deliberately not cleared here: in some cases, such as
  ** inserting with auto-generated integer keys right after a call to
  ** sqlite3BtreeLast(), btreeMoveto() sees that the cursor is already in
  ** the right place and returns without doing any work.
  */
  rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
  if( rc ) return rc;
  if( cmp==0 ){
    rc = btreeMoveto(pCur, pKey, nKey, appendBias, &cmp);
    if( rc ) return rc;
  }
  assert( pCur->eState==CURSOR_VALID
       || (pCur->eState==CURSOR_INVALID && cmp) );

  pPage = pCur->apPage[pCur->iPage];
  assert( pPage->intKey || nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );

  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, nKey, nData, pPage->pgno,
          cmp==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );

  /* Build the new cell in the shared temporary buffer. */
  allocateTempSpace(pBt);
  pNewCell = pBt->pTmpSpace;
  if( pNewCell==0 ) return SQLITE_NOMEM;
  rc = fillInCell(pPage, pNewCell, pKey, nKey, pData, nData, nZero,
                  &nNewCell);
  if( rc ) return rc;
  assert( nNewCell==cellSizePtr(pPage, pNewCell) );
  assert( nNewCell <= MX_CELL_SIZE(pBt) );

  iCell = pCur->aiIdx[pCur->iPage];
  if( cmp==0 ){
    /* Overwrite: remove the existing cell first. On a non-leaf page the
    ** first 4 bytes of the old cell are carried over into the new one. */
    unsigned char *pOldCell;
    u16 nOldCell;

    assert( iCell<pPage->nCell );
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc ) return rc;
    pOldCell = findCell(pPage, iCell);
    if( !pPage->leaf ){
      memcpy(pNewCell, pOldCell, 4);
    }
    nOldCell = cellSizePtr(pPage, pOldCell);
    rc = clearCell(pPage, pOldCell);
    dropCell(pPage, iCell, nOldCell, &rc);
    if( rc ) return rc;
  }else{
    /* New entry: always goes on a leaf. If the cursor sits on an entry
    ** before the insert point, insert just after it. */
    assert( pPage->leaf );
    if( cmp<0 && pPage->nCell>0 ){
      iCell = ++pCur->aiIdx[pCur->iPage];
    }
  }
  insertCell(pPage, iCell, pNewCell, nNewCell, 0, 0, &rc);
  assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );

  /* If no error has occurred and pPage has an overflow cell, call
  ** balance() to redistribute the cells within the tree. balance() may
  ** move the cursor, so the cached BtCursor.info.nSize and
  ** BtCursor.validNKey are zeroed first.
  **
  ** After balancing, the cursor is marked "invalid" rather than being
  ** moved back to the root page, which keeps common inserts slightly
  ** faster.
  **
  ** When no balance is needed the cursor is left where it is. That helps
  ** "INSERT INTO ... SELECT" on an intkey b-tree: if the cursor still
  ** points at the last entry and the next key is larger than any
  ** existing key, the next insert needs no seek at all.
  */
  pCur->info.nSize = 0;
  pCur->validNKey = 0;
  if( rc==SQLITE_OK && pPage->nOverflow ){
    rc = balance(pCur);

    /* nOverflow must be reset to zero even if balance() failed, or
    ** internal data structure corruption will result. Marking the cursor
    ** invalid stops saveCursorPosition() from trying to save its
    ** current position. */
    pCur->apPage[pCur->iPage]->nOverflow = 0;
    pCur->eState = CURSOR_INVALID;
  }
  assert( pCur->apPage[pCur->iPage]->nOverflow==0 );

  return rc;
}
6777
6778 /*
6779 ** Delete the entry that the cursor is pointing to. The cursor
** is left pointing at an arbitrary location.
6781 */
int sqlite3BtreeDelete(BtCursor *pCur){
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  int rc;                              /* Return code */
  MemPage *pPage;                      /* Page to delete cell from */
  unsigned char *pCell;                /* Pointer to cell to delete */
  int iCellIdx;                        /* Index of cell to delete */
  int iCellDepth;                      /* Depth of node containing pCell */

  assert( cursorHoldsMutex(pCur) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( !pBt->readOnly );
  assert( pCur->wrFlag );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  assert( !hasReadConflicts(p, pCur->pgnoRoot) );

  /* The cursor must be valid and must point at an existing cell. */
  if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
   || NEVER(pCur->eState!=CURSOR_VALID)
  ){
    return SQLITE_ERROR;  /* Something has gone awry. */
  }

  /* If this is a delete operation to remove a row from a table b-tree,
  ** invalidate any incrblob cursors open on the row being deleted. */
  if( pCur->pKeyInfo==0 ){
    invalidateIncrblobCursors(p, pCur->info.nKey, 0);
  }

  /* Remember where the doomed cell lives. The cursor itself may be moved
  ** to a leaf below, but these values keep referring to the original
  ** page and cell. */
  iCellDepth = pCur->iPage;
  iCellIdx = pCur->aiIdx[iCellDepth];
  pPage = pCur->apPage[iCellDepth];
  pCell = findCell(pPage, iCellIdx);

  /* If the page containing the entry to delete is not a leaf page, move
  ** the cursor to the largest entry in the tree that is smaller than
  ** the entry being deleted. This cell will replace the cell being deleted
  ** from the internal node. The 'previous' entry is used for this instead
  ** of the 'next' entry, as the previous entry is always a part of the
  ** sub-tree headed by the child page of the cell being deleted. This makes
  ** balancing the tree following the delete operation easier. */
  if( !pPage->leaf ){
    int notUsed;
    rc = sqlite3BtreePrevious(pCur, &notUsed);
    if( rc ) return rc;
  }

  /* Save the positions of any other cursors open on this table before
  ** making any modifications. Make the page containing the entry to be
  ** deleted writable. Then free any overflow pages associated with the
  ** entry and finally remove the cell itself from within the page.
  */
  rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
  if( rc ) return rc;
  rc = sqlite3PagerWrite(pPage->pDbPage);
  if( rc ) return rc;
  rc = clearCell(pPage, pCell);
  dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
  if( rc ) return rc;

  /* If the cell deleted was not located on a leaf page, then the cursor
  ** is currently pointing to the largest entry in the sub-tree headed
  ** by the child-page of the cell that was just deleted from an internal
  ** node. The cell from the leaf node needs to be moved to the internal
  ** node to replace the deleted cell. */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->apPage[pCur->iPage];
    int nCell;
    Pgno n = pCur->apPage[iCellDepth+1]->pgno;
    unsigned char *pTmp;

    pCell = findCell(pLeaf, pLeaf->nCell-1);
    nCell = cellSizePtr(pLeaf, pCell);
    assert( MX_CELL_SIZE(pBt) >= nCell );

    /* allocateTempSpace() can leave pBt->pTmpSpace NULL (the insert path,
    ** sqlite3BtreeInsert(), checks for exactly this), so report
    ** SQLITE_NOMEM rather than hand a NULL scratch buffer to insertCell().
    */
    allocateTempSpace(pBt);
    pTmp = pBt->pTmpSpace;
    if( pTmp==0 ) return SQLITE_NOMEM;

    /* The replacement cell is the leaf cell with 4 extra leading bytes
    ** (hence pCell-4 and nCell+4); page number n is passed to insertCell()
    ** as its child page. */
    rc = sqlite3PagerWrite(pLeaf->pDbPage);
    insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
    dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
    if( rc ) return rc;
  }

  /* Balance the tree. If the entry deleted was located on a leaf page,
  ** then the cursor still points to that page. In this case the first
  ** call to balance() repairs the tree, and the if(...) condition is
  ** never true.
  **
  ** Otherwise, if the entry deleted was on an internal node page, then
  ** pCur is pointing to the leaf page from which a cell was removed to
  ** replace the cell deleted from the internal node. This is slightly
  ** tricky as the leaf node may be underfull, and the internal node may
  ** be either under or overfull. In this case run the balancing algorithm
  ** on the leaf node first. If the balance proceeds far enough up the
  ** tree that we can be sure that any problem in the internal node has
  ** been corrected, so be it. Otherwise, after balancing the leaf node,
  ** walk the cursor up the tree to the internal node and balance it as
  ** well.  */
  rc = balance(pCur);
  if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
    while( pCur->iPage>iCellDepth ){
      releasePage(pCur->apPage[pCur->iPage--]);
    }
    rc = balance(pCur);
  }

  /* On success the cursor is moved back to the root page. */
  if( rc==SQLITE_OK ){
    moveToRoot(pCur);
  }
  return rc;
}
6893
6894 /*
6895 ** Create a new BTree table. Write into *piTable the page
6896 ** number for the root page of the new table.
6897 **
6898 ** The type of type is determined by the flags parameter. Only the
6899 ** following values of flags are currently in use. Other values for
6900 ** flags might not work:
6901 **
6902 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6903 ** BTREE_ZERODATA Used for SQL indices
6904 */
btreeCreateTable(Btree * p,int * piTable,int createTabFlags)6905 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
6906 BtShared *pBt = p->pBt;
6907 MemPage *pRoot;
6908 Pgno pgnoRoot;
6909 int rc;
6910 int ptfFlags; /* Page-type flage for the root page of new table */
6911
6912 assert( sqlite3BtreeHoldsMutex(p) );
6913 assert( pBt->inTransaction==TRANS_WRITE );
6914 assert( !pBt->readOnly );
6915
6916 #ifdef SQLITE_OMIT_AUTOVACUUM
6917 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6918 if( rc ){
6919 return rc;
6920 }
6921 #else
6922 if( pBt->autoVacuum ){
6923 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6924 MemPage *pPageMove; /* The page to move to. */
6925
6926 /* Creating a new table may probably require moving an existing database
6927 ** to make room for the new tables root page. In case this page turns
6928 ** out to be an overflow page, delete all overflow page-map caches
6929 ** held by open cursors.
6930 */
6931 invalidateAllOverflowCache(pBt);
6932
6933 /* Read the value of meta[3] from the database to determine where the
6934 ** root page of the new table should go. meta[3] is the largest root-page
6935 ** created so far, so the new root-page is (meta[3]+1).
6936 */
6937 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
6938 pgnoRoot++;
6939
6940 /* The new root-page may not be allocated on a pointer-map page, or the
6941 ** PENDING_BYTE page.
6942 */
6943 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6944 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6945 pgnoRoot++;
6946 }
6947 assert( pgnoRoot>=3 );
6948
6949 /* Allocate a page. The page that currently resides at pgnoRoot will
6950 ** be moved to the allocated page (unless the allocated page happens
6951 ** to reside at pgnoRoot).
6952 */
6953 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6954 if( rc!=SQLITE_OK ){
6955 return rc;
6956 }
6957
6958 if( pgnoMove!=pgnoRoot ){
6959 /* pgnoRoot is the page that will be used for the root-page of
6960 ** the new table (assuming an error did not occur). But we were
6961 ** allocated pgnoMove. If required (i.e. if it was not allocated
6962 ** by extending the file), the current page at position pgnoMove
6963 ** is already journaled.
6964 */
6965 u8 eType = 0;
6966 Pgno iPtrPage = 0;
6967
6968 releasePage(pPageMove);
6969
6970 /* Move the page currently at pgnoRoot to pgnoMove. */
6971 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6972 if( rc!=SQLITE_OK ){
6973 return rc;
6974 }
6975 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6976 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6977 rc = SQLITE_CORRUPT_BKPT;
6978 }
6979 if( rc!=SQLITE_OK ){
6980 releasePage(pRoot);
6981 return rc;
6982 }
6983 assert( eType!=PTRMAP_ROOTPAGE );
6984 assert( eType!=PTRMAP_FREEPAGE );
6985 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6986 releasePage(pRoot);
6987
6988 /* Obtain the page at pgnoRoot */
6989 if( rc!=SQLITE_OK ){
6990 return rc;
6991 }
6992 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6993 if( rc!=SQLITE_OK ){
6994 return rc;
6995 }
6996 rc = sqlite3PagerWrite(pRoot->pDbPage);
6997 if( rc!=SQLITE_OK ){
6998 releasePage(pRoot);
6999 return rc;
7000 }
7001 }else{
7002 pRoot = pPageMove;
7003 }
7004
7005 /* Update the pointer-map and meta-data with the new root-page number. */
7006 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
7007 if( rc ){
7008 releasePage(pRoot);
7009 return rc;
7010 }
7011
7012 /* When the new root page was allocated, page 1 was made writable in
7013 ** order either to increase the database filesize, or to decrement the
7014 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
7015 */
7016 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
7017 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
7018 if( NEVER(rc) ){
7019 releasePage(pRoot);
7020 return rc;
7021 }
7022
7023 }else{
7024 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
7025 if( rc ) return rc;
7026 }
7027 #endif
7028 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
7029 if( createTabFlags & BTREE_INTKEY ){
7030 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
7031 }else{
7032 ptfFlags = PTF_ZERODATA | PTF_LEAF;
7033 }
7034 zeroPage(pRoot, ptfFlags);
7035 sqlite3PagerUnref(pRoot->pDbPage);
7036 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
7037 *piTable = (int)pgnoRoot;
7038 return SQLITE_OK;
7039 }
/*
** Mutex-taking entry point for table creation.  The Btree is entered,
** the request is forwarded unchanged to btreeCreateTable() (which
** asserts that the mutex is held), and the Btree is left again.  The
** result code of btreeCreateTable() is returned as-is.
*/
int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
  int rcCreate;                   /* Result code from btreeCreateTable() */

  sqlite3BtreeEnter(p);
  rcCreate = btreeCreateTable(p, piTable, flags);
  sqlite3BtreeLeave(p);

  return rcCreate;
}
7047
7048 /*
7049 ** Erase the given database page and all its children. Return
7050 ** the page to the freelist.
7051 */
clearDatabasePage(BtShared * pBt,Pgno pgno,int freePageFlag,int * pnChange)7052 static int clearDatabasePage(
7053 BtShared *pBt, /* The BTree that contains the table */
7054 Pgno pgno, /* Page number to clear */
7055 int freePageFlag, /* Deallocate page if true */
7056 int *pnChange /* Add number of Cells freed to this counter */
7057 ){
7058 MemPage *pPage;
7059 int rc;
7060 unsigned char *pCell;
7061 int i;
7062
7063 assert( sqlite3_mutex_held(pBt->mutex) );
7064 if( pgno>btreePagecount(pBt) ){
7065 return SQLITE_CORRUPT_BKPT;
7066 }
7067
7068 rc = getAndInitPage(pBt, pgno, &pPage);
7069 if( rc ) return rc;
7070 for(i=0; i<pPage->nCell; i++){
7071 pCell = findCell(pPage, i);
7072 if( !pPage->leaf ){
7073 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
7074 if( rc ) goto cleardatabasepage_out;
7075 }
7076 rc = clearCell(pPage, pCell);
7077 if( rc ) goto cleardatabasepage_out;
7078 }
7079 if( !pPage->leaf ){
7080 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
7081 if( rc ) goto cleardatabasepage_out;
7082 }else if( pnChange ){
7083 assert( pPage->intKey );
7084 *pnChange += pPage->nCell;
7085 }
7086 if( freePageFlag ){
7087 freePage(pPage, &rc);
7088 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
7089 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
7090 }
7091
7092 cleardatabasepage_out:
7093 releasePage(pPage);
7094 return rc;
7095 }
7096
7097 /*
7098 ** Delete all information from a single table in the database. iTable is
7099 ** the page number of the root of the table. After this routine returns,
7100 ** the root page is empty, but still exists.
7101 **
7102 ** This routine will fail with SQLITE_LOCKED if there are any open
7103 ** read cursors on the table. Open write cursors are moved to the
7104 ** root of the table.
7105 **
7106 ** If pnChange is not NULL, then table iTable must be an intkey table. The
7107 ** integer value pointed to by pnChange is incremented by the number of
7108 ** entries in the table.
7109 */
sqlite3BtreeClearTable(Btree * p,int iTable,int * pnChange)7110 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
7111 int rc;
7112 BtShared *pBt = p->pBt;
7113 sqlite3BtreeEnter(p);
7114 assert( p->inTrans==TRANS_WRITE );
7115
7116 /* Invalidate all incrblob cursors open on table iTable (assuming iTable
7117 ** is the root of a table b-tree - if it is not, the following call is
7118 ** a no-op). */
7119 invalidateIncrblobCursors(p, 0, 1);
7120
7121 rc = saveAllCursors(pBt, (Pgno)iTable, 0);
7122 if( SQLITE_OK==rc ){
7123 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
7124 }
7125 sqlite3BtreeLeave(p);
7126 return rc;
7127 }
7128
7129 /*
7130 ** Erase all information in a table and add the root of the table to
7131 ** the freelist. Except, the root of the principle table (the one on
7132 ** page 1) is never added to the freelist.
7133 **
7134 ** This routine will fail with SQLITE_LOCKED if there are any open
7135 ** cursors on the table.
7136 **
7137 ** If AUTOVACUUM is enabled and the page at iTable is not the last
7138 ** root page in the database file, then the last root page
7139 ** in the database file is moved into the slot formerly occupied by
7140 ** iTable and that last slot formerly occupied by the last root page
7141 ** is added to the freelist instead of iTable. In this say, all
7142 ** root pages are kept at the beginning of the database file, which
7143 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
7144 ** page number that used to be the last root page in the file before
7145 ** the move. If no page gets moved, *piMoved is set to 0.
7146 ** The last root page is recorded in meta[3] and the value of
7147 ** meta[3] is updated by this procedure.
7148 */
btreeDropTable(Btree * p,Pgno iTable,int * piMoved)7149 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
7150 int rc;
7151 MemPage *pPage = 0;
7152 BtShared *pBt = p->pBt;
7153
7154 assert( sqlite3BtreeHoldsMutex(p) );
7155 assert( p->inTrans==TRANS_WRITE );
7156
7157 /* It is illegal to drop a table if any cursors are open on the
7158 ** database. This is because in auto-vacuum mode the backend may
7159 ** need to move another root-page to fill a gap left by the deleted
7160 ** root page. If an open cursor was using this page a problem would
7161 ** occur.
7162 **
7163 ** This error is caught long before control reaches this point.
7164 */
7165 if( NEVER(pBt->pCursor) ){
7166 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
7167 return SQLITE_LOCKED_SHAREDCACHE;
7168 }
7169
7170 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
7171 if( rc ) return rc;
7172 rc = sqlite3BtreeClearTable(p, iTable, 0);
7173 if( rc ){
7174 releasePage(pPage);
7175 return rc;
7176 }
7177
7178 *piMoved = 0;
7179
7180 if( iTable>1 ){
7181 #ifdef SQLITE_OMIT_AUTOVACUUM
7182 freePage(pPage, &rc);
7183 releasePage(pPage);
7184 #else
7185 if( pBt->autoVacuum ){
7186 Pgno maxRootPgno;
7187 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
7188
7189 if( iTable==maxRootPgno ){
7190 /* If the table being dropped is the table with the largest root-page
7191 ** number in the database, put the root page on the free list.
7192 */
7193 freePage(pPage, &rc);
7194 releasePage(pPage);
7195 if( rc!=SQLITE_OK ){
7196 return rc;
7197 }
7198 }else{
7199 /* The table being dropped does not have the largest root-page
7200 ** number in the database. So move the page that does into the
7201 ** gap left by the deleted root-page.
7202 */
7203 MemPage *pMove;
7204 releasePage(pPage);
7205 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7206 if( rc!=SQLITE_OK ){
7207 return rc;
7208 }
7209 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
7210 releasePage(pMove);
7211 if( rc!=SQLITE_OK ){
7212 return rc;
7213 }
7214 pMove = 0;
7215 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
7216 freePage(pMove, &rc);
7217 releasePage(pMove);
7218 if( rc!=SQLITE_OK ){
7219 return rc;
7220 }
7221 *piMoved = maxRootPgno;
7222 }
7223
7224 /* Set the new 'max-root-page' value in the database header. This
7225 ** is the old value less one, less one more if that happens to
7226 ** be a root-page number, less one again if that is the
7227 ** PENDING_BYTE_PAGE.
7228 */
7229 maxRootPgno--;
7230 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
7231 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
7232 maxRootPgno--;
7233 }
7234 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
7235
7236 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
7237 }else{
7238 freePage(pPage, &rc);
7239 releasePage(pPage);
7240 }
7241 #endif
7242 }else{
7243 /* If sqlite3BtreeDropTable was called on page 1.
7244 ** This really never should happen except in a corrupt
7245 ** database.
7246 */
7247 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
7248 releasePage(pPage);
7249 }
7250 return rc;
7251 }
/*
** Mutex-taking entry point for dropping a table.  The Btree is entered,
** the request is forwarded to btreeDropTable() (which asserts that the
** mutex is held), and the Btree is left again.  The result code of
** btreeDropTable() is returned as-is.
*/
int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  int rcDrop;                     /* Result code from btreeDropTable() */

  sqlite3BtreeEnter(p);
  rcDrop = btreeDropTable(p, iTable, piMoved);
  sqlite3BtreeLeave(p);

  return rcDrop;
}
7259
7260
7261 /*
7262 ** This function may only be called if the b-tree connection already
7263 ** has a read or write transaction open on the database.
7264 **
7265 ** Read the meta-information out of a database file. Meta[0]
7266 ** is the number of free pages currently in the database. Meta[1]
7267 ** through meta[15] are available for use by higher layers. Meta[0]
7268 ** is read-only, the others are read/write.
7269 **
7270 ** The schema layer numbers meta values differently. At the schema
7271 ** layer (and the SetCookie and ReadCookie opcodes) the number of
7272 ** free pages is not visible. So Cookie[0] is the same as Meta[1].
7273 */
sqlite3BtreeGetMeta(Btree * p,int idx,u32 * pMeta)7274 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
7275 BtShared *pBt = p->pBt;
7276
7277 sqlite3BtreeEnter(p);
7278 assert( p->inTrans>TRANS_NONE );
7279 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
7280 assert( pBt->pPage1 );
7281 assert( idx>=0 && idx<=15 );
7282
7283 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
7284
7285 /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7286 ** database, mark the database as read-only. */
7287 #ifdef SQLITE_OMIT_AUTOVACUUM
7288 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1;
7289 #endif
7290
7291 sqlite3BtreeLeave(p);
7292 }
7293
7294 /*
7295 ** Write meta-information back into the database. Meta[0] is
7296 ** read-only and may not be written.
7297 */
sqlite3BtreeUpdateMeta(Btree * p,int idx,u32 iMeta)7298 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7299 BtShared *pBt = p->pBt;
7300 unsigned char *pP1;
7301 int rc;
7302 assert( idx>=1 && idx<=15 );
7303 sqlite3BtreeEnter(p);
7304 assert( p->inTrans==TRANS_WRITE );
7305 assert( pBt->pPage1!=0 );
7306 pP1 = pBt->pPage1->aData;
7307 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7308 if( rc==SQLITE_OK ){
7309 put4byte(&pP1[36 + idx*4], iMeta);
7310 #ifndef SQLITE_OMIT_AUTOVACUUM
7311 if( idx==BTREE_INCR_VACUUM ){
7312 assert( pBt->autoVacuum || iMeta==0 );
7313 assert( iMeta==0 || iMeta==1 );
7314 pBt->incrVacuum = (u8)iMeta;
7315 }
7316 #endif
7317 }
7318 sqlite3BtreeLeave(p);
7319 return rc;
7320 }
7321
7322 #ifndef SQLITE_OMIT_BTREECOUNT
7323 /*
7324 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
7325 ** number of entries in the b-tree and write the result to *pnEntry.
7326 **
7327 ** SQLITE_OK is returned if the operation is successfully executed.
7328 ** Otherwise, if an error is encountered (i.e. an IO error or database
7329 ** corruption) an SQLite error code is returned.
7330 */
sqlite3BtreeCount(BtCursor * pCur,i64 * pnEntry)7331 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7332 i64 nEntry = 0; /* Value to return in *pnEntry */
7333 int rc; /* Return code */
7334 rc = moveToRoot(pCur);
7335
7336 /* Unless an error occurs, the following loop runs one iteration for each
7337 ** page in the B-Tree structure (not including overflow pages).
7338 */
7339 while( rc==SQLITE_OK ){
7340 int iIdx; /* Index of child node in parent */
7341 MemPage *pPage; /* Current page of the b-tree */
7342
7343 /* If this is a leaf page or the tree is not an int-key tree, then
7344 ** this page contains countable entries. Increment the entry counter
7345 ** accordingly.
7346 */
7347 pPage = pCur->apPage[pCur->iPage];
7348 if( pPage->leaf || !pPage->intKey ){
7349 nEntry += pPage->nCell;
7350 }
7351
7352 /* pPage is a leaf node. This loop navigates the cursor so that it
7353 ** points to the first interior cell that it points to the parent of
7354 ** the next page in the tree that has not yet been visited. The
7355 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7356 ** of the page, or to the number of cells in the page if the next page
7357 ** to visit is the right-child of its parent.
7358 **
7359 ** If all pages in the tree have been visited, return SQLITE_OK to the
7360 ** caller.
7361 */
7362 if( pPage->leaf ){
7363 do {
7364 if( pCur->iPage==0 ){
7365 /* All pages of the b-tree have been visited. Return successfully. */
7366 *pnEntry = nEntry;
7367 return SQLITE_OK;
7368 }
7369 moveToParent(pCur);
7370 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7371
7372 pCur->aiIdx[pCur->iPage]++;
7373 pPage = pCur->apPage[pCur->iPage];
7374 }
7375
7376 /* Descend to the child node of the cell that the cursor currently
7377 ** points at. This is the right-child if (iIdx==pPage->nCell).
7378 */
7379 iIdx = pCur->aiIdx[pCur->iPage];
7380 if( iIdx==pPage->nCell ){
7381 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7382 }else{
7383 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7384 }
7385 }
7386
7387 /* An error has occurred. Return an error code. */
7388 return rc;
7389 }
7390 #endif
7391
7392 /*
7393 ** Return the pager associated with a BTree. This routine is used for
7394 ** testing and debugging only.
7395 */
sqlite3BtreePager(Btree * p)7396 Pager *sqlite3BtreePager(Btree *p){
7397 return p->pBt->pPager;
7398 }
7399
7400 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7401 /*
7402 ** Append a message to the error message string.
7403 */
checkAppendMsg(IntegrityCk * pCheck,char * zMsg1,const char * zFormat,...)7404 static void checkAppendMsg(
7405 IntegrityCk *pCheck,
7406 char *zMsg1,
7407 const char *zFormat,
7408 ...
7409 ){
7410 va_list ap;
7411 if( !pCheck->mxErr ) return;
7412 pCheck->mxErr--;
7413 pCheck->nErr++;
7414 va_start(ap, zFormat);
7415 if( pCheck->errMsg.nChar ){
7416 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
7417 }
7418 if( zMsg1 ){
7419 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
7420 }
7421 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7422 va_end(ap);
7423 if( pCheck->errMsg.mallocFailed ){
7424 pCheck->mallocFailed = 1;
7425 }
7426 }
7427 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7428
7429 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7430 /*
7431 ** Add 1 to the reference count for page iPage. If this is the second
7432 ** reference to the page, add an error message to pCheck->zErrMsg.
7433 ** Return 1 if there are 2 ore more references to the page and 0 if
7434 ** if this is the first reference to the page.
7435 **
7436 ** Also check that the page number is in bounds.
7437 */
checkRef(IntegrityCk * pCheck,Pgno iPage,char * zContext)7438 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7439 if( iPage==0 ) return 1;
7440 if( iPage>pCheck->nPage ){
7441 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7442 return 1;
7443 }
7444 if( pCheck->anRef[iPage]==1 ){
7445 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7446 return 1;
7447 }
7448 return (pCheck->anRef[iPage]++)>1;
7449 }
7450
7451 #ifndef SQLITE_OMIT_AUTOVACUUM
7452 /*
7453 ** Check that the entry in the pointer-map for page iChild maps to
7454 ** page iParent, pointer type ptrType. If not, append an error message
7455 ** to pCheck.
7456 */
checkPtrmap(IntegrityCk * pCheck,Pgno iChild,u8 eType,Pgno iParent,char * zContext)7457 static void checkPtrmap(
7458 IntegrityCk *pCheck, /* Integrity check context */
7459 Pgno iChild, /* Child page number */
7460 u8 eType, /* Expected pointer map type */
7461 Pgno iParent, /* Expected pointer map parent page number */
7462 char *zContext /* Context description (used for error msg) */
7463 ){
7464 int rc;
7465 u8 ePtrmapType;
7466 Pgno iPtrmapParent;
7467
7468 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7469 if( rc!=SQLITE_OK ){
7470 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
7471 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7472 return;
7473 }
7474
7475 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7476 checkAppendMsg(pCheck, zContext,
7477 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7478 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7479 }
7480 }
7481 #endif
7482
7483 /*
7484 ** Check the integrity of the freelist or of an overflow page list.
7485 ** Verify that the number of pages on the list is N.
7486 */
checkList(IntegrityCk * pCheck,int isFreeList,int iPage,int N,char * zContext)7487 static void checkList(
7488 IntegrityCk *pCheck, /* Integrity checking context */
7489 int isFreeList, /* True for a freelist. False for overflow page list */
7490 int iPage, /* Page number for first page in the list */
7491 int N, /* Expected number of pages in the list */
7492 char *zContext /* Context for error messages */
7493 ){
7494 int i;
7495 int expected = N;
7496 int iFirst = iPage;
7497 while( N-- > 0 && pCheck->mxErr ){
7498 DbPage *pOvflPage;
7499 unsigned char *pOvflData;
7500 if( iPage<1 ){
7501 checkAppendMsg(pCheck, zContext,
7502 "%d of %d pages missing from overflow list starting at %d",
7503 N+1, expected, iFirst);
7504 break;
7505 }
7506 if( checkRef(pCheck, iPage, zContext) ) break;
7507 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7508 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7509 break;
7510 }
7511 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7512 if( isFreeList ){
7513 int n = get4byte(&pOvflData[4]);
7514 #ifndef SQLITE_OMIT_AUTOVACUUM
7515 if( pCheck->pBt->autoVacuum ){
7516 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7517 }
7518 #endif
7519 if( n>(int)pCheck->pBt->usableSize/4-2 ){
7520 checkAppendMsg(pCheck, zContext,
7521 "freelist leaf count too big on page %d", iPage);
7522 N--;
7523 }else{
7524 for(i=0; i<n; i++){
7525 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7526 #ifndef SQLITE_OMIT_AUTOVACUUM
7527 if( pCheck->pBt->autoVacuum ){
7528 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7529 }
7530 #endif
7531 checkRef(pCheck, iFreePage, zContext);
7532 }
7533 N -= n;
7534 }
7535 }
7536 #ifndef SQLITE_OMIT_AUTOVACUUM
7537 else{
7538 /* If this database supports auto-vacuum and iPage is not the last
7539 ** page in this overflow list, check that the pointer-map entry for
7540 ** the following page matches iPage.
7541 */
7542 if( pCheck->pBt->autoVacuum && N>0 ){
7543 i = get4byte(pOvflData);
7544 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7545 }
7546 }
7547 #endif
7548 iPage = get4byte(pOvflData);
7549 sqlite3PagerUnref(pOvflPage);
7550 }
7551 }
7552 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7553
7554 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** Do various sanity checks on a single page of a tree and, by recursion,
** on the pages beneath it.
**
** Return value: 0 if iPage is zero, if checkRef() rejects the page, or
** if the page cannot be fetched or initialised.  Otherwise one more than
** the value returned for the last left-child that was checked, so a leaf
** page returns 1.
**
** pnParentMinKey and pnParentMaxKey are only consulted for intkey leaf
** pages.  This routine passes (&nMinKey, NULL) for the left-most child,
** (&nMinKey, &nMaxKey) for the other left children and (NULL, &nMaxKey)
** for the right child, where nMinKey/nMaxKey are the caller's own key
** bounds.  The top-level caller passes NULL for both.
**
** These checks are done:
**
**      1.  Make sure that cells and freeblocks do not overlap
**          but combine to completely cover the page.
**  NO  2.  Make sure cell keys are in order.  (Rowid order on intkey
**          pages IS checked below.)
**  NO  3.  Make sure no key is less than or equal to zLowerBound.
**  NO  4.  Make sure no key is greater than or equal to zUpperBound.
**      5.  Check the integrity of overflow pages.
**      6.  Recursively call checkTreePage on all children.
**      7.  Verify that the depth of all children is the same.
**      8.  Make sure this page is at least 33% full or else it is
**          the root of the tree.  (NOTE(review): no such check is
**          visible in the body of this routine.)
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  char *zParentContext, /* Parent context */
  i64 *pnParentMinKey,  /* Key bound from the parent; non-NULL for left children */
  i64 *pnParentMaxKey   /* Key bound from the parent; NULL for the left-most child */
){
  MemPage *pPage;
  int i, rc, depth, d2, pgno, cnt;
  int hdr, cellStart;
  int nCell;
  u8 *data;
  BtShared *pBt;
  int usableSize;
  char zContext[100];
  char *hit = 0;          /* One counter per byte of the page; see below */
  i64 nMinKey = 0;        /* Rowid of the first cell on an intkey page */
  i64 nMaxKey = 0;        /* Rowid of the most recently examined cell */

  sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage, zParentContext) ) return 0;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck, zContext,
       "unable to get the page. error code=%d", rc);
    return 0;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck, zContext,
                   "btreeInitPage() returns error code %d", rc);
    releasePage(pPage);
    return 0;
  }

  /* Check out all the cells.  The loop stops early once the error budget
  ** pCheck->mxErr has been used up.
  */
  depth = 0;
  for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
    u8 *pCell;
    u32 sz;
    CellInfo info;

    /* Check payload overflow pages
    */
    sqlite3_snprintf(sizeof(zContext), zContext,
             "On tree page %d cell %d: ", iPage, i);
    pCell = findCell(pPage,i);
    btreeParseCellPtr(pPage, pCell, &info);
    sz = info.nData;
    if( !pPage->intKey ) sz += (int)info.nKey;
    /* For intKey pages, check that the keys are in order.
    */
    else if( i==0 ) nMinKey = nMaxKey = info.nKey;
    else{
      if( info.nKey <= nMaxKey ){
        checkAppendMsg(pCheck, zContext,
            "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
      }
      nMaxKey = info.nKey;
    }
    assert( sz==info.nPayload );
    /* The payload spills onto overflow pages when it is larger than the
    ** part stored locally.  The overflow page number is only read if it
    ** lies within the usable area of the page. */
    if( (sz>info.nLocal)
     && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
    ){
      /* Number of overflow pages needed: each one carries usableSize-4
      ** bytes of payload (rounded up). */
      int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
      Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
    }

    /* Check sanity of left child page.
    */
    if( !pPage->leaf ){
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
      }
#endif
      d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
      /* Every left child must report the same depth as the previous one */
      if( i>0 && d2!=depth ){
        checkAppendMsg(pCheck, zContext, "Child page depth differs");
      }
      depth = d2;
    }
  }

  /* On an interior page, also check the right-most child.  Its return
  ** value is not compared against depth. */
  if( !pPage->leaf ){
    pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
    sqlite3_snprintf(sizeof(zContext), zContext,
                     "On page %d at right child: ", iPage);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
    }
#endif
    checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
  }

  /* For intKey leaf pages, check that the min/max keys are in order
  ** with any left/parent/right pages.
  */
  if( pPage->leaf && pPage->intKey ){
    /* if we are a left child page */
    if( pnParentMinKey ){
      /* if we are the left most child page */
      if( !pnParentMaxKey ){
        if( nMaxKey > *pnParentMinKey ){
          checkAppendMsg(pCheck, zContext,
              "Rowid %lld out of order (max larger than parent min of %lld)",
              nMaxKey, *pnParentMinKey);
        }
      }else{
        if( nMinKey <= *pnParentMinKey ){
          checkAppendMsg(pCheck, zContext,
              "Rowid %lld out of order (min less than parent min of %lld)",
              nMinKey, *pnParentMinKey);
        }
        if( nMaxKey > *pnParentMaxKey ){
          checkAppendMsg(pCheck, zContext,
              "Rowid %lld out of order (max larger than parent max of %lld)",
              nMaxKey, *pnParentMaxKey);
        }
        /* Hand this leaf's largest key back to the parent so that it
        ** becomes the lower bound for the next sibling. */
        *pnParentMinKey = nMaxKey;
      }
    /* else if we're a right child page */
    } else if( pnParentMaxKey ){
      if( nMinKey <= *pnParentMaxKey ){
        checkAppendMsg(pCheck, zContext,
            "Rowid %lld out of order (min less than parent max of %lld)",
            nMinKey, *pnParentMaxKey);
      }
    }
  }

  /* Check for complete coverage of the page.  hit[] holds one counter per
  ** byte: bytes below the cell content offset start at 1, all others at
  ** 0, and every byte inside a cell or freeblock is incremented.  A byte
  ** left at 0 is unaccounted for (fragmentation); a byte above 1 is
  ** claimed more than once.
  */
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  hit = sqlite3PageMalloc( pBt->pageSize );
  if( hit==0 ){
    pCheck->mallocFailed = 1;
  }else{
    int contentOffset = get2byteNotZero(&data[hdr+5]);
    assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
    memset(hit+contentOffset, 0, usableSize-contentOffset);
    memset(hit, 1, contentOffset);
    nCell = get2byte(&data[hdr+3]);
    /* The cell pointer array follows the page header, which is 4 bytes
    ** shorter on leaf pages. */
    cellStart = hdr + 12 - 4*pPage->leaf;
    for(i=0; i<nCell; i++){
      int pc = get2byte(&data[cellStart+i*2]);
      u32 size = 65536;   /* Guaranteed to fail the bounds test below */
      int j;
      if( pc<=usableSize-4 ){
        size = cellSizePtr(pPage, &data[pc]);
      }
      if( (int)(pc+size-1)>=usableSize ){
        checkAppendMsg(pCheck, 0,
            "Corruption detected in cell %d on page %d",i,iPage);
      }else{
        for(j=pc+size-1; j>=pc; j--) hit[j]++;
      }
    }
    /* Walk the freeblock chain: bytes 0..1 of each freeblock give the
    ** offset of the next one, bytes 2..3 give its size. */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */
      size = get2byte(&data[i+2]);
      assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */
      for(j=i+size-1; j>=i; j--) hit[j]++;
      j = get2byte(&data[i]);
      assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
      assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */
      i = j;
    }
    /* Count unclaimed bytes; report the first byte claimed twice and
    ** stop scanning there. */
    for(i=cnt=0; i<usableSize; i++){
      if( hit[i]==0 ){
        cnt++;
      }else if( hit[i]>1 ){
        checkAppendMsg(pCheck, 0,
          "Multiple uses for byte %d of page %d", i, iPage);
        break;
      }
    }
    /* Byte hdr+7 of the page header records the fragmented byte count */
    if( cnt!=data[hdr+7] ){
      checkAppendMsg(pCheck, 0,
          "Fragmentation of %d bytes reported as %d on page %d",
          cnt, data[hdr+7], iPage);
    }
  }
  sqlite3PageFree(hit);
  releasePage(pPage);
  return depth+1;
}
7781 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7782
7783 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7784 /*
7785 ** This routine does a complete check of the given BTree file. aRoot[] is
7786 ** an array of pages numbers were each page number is the root page of
7787 ** a table. nRoot is the number of entries in aRoot.
7788 **
7789 ** A read-only or read-write transaction must be opened before calling
7790 ** this function.
7791 **
7792 ** Write the number of error seen in *pnErr. Except for some memory
7793 ** allocation errors, an error message held in memory obtained from
7794 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
7795 ** returned. If a memory allocation error occurs, NULL is returned.
7796 */
sqlite3BtreeIntegrityCheck(Btree * p,int * aRoot,int nRoot,int mxErr,int * pnErr)7797 char *sqlite3BtreeIntegrityCheck(
7798 Btree *p, /* The btree to be checked */
7799 int *aRoot, /* An array of root pages numbers for individual trees */
7800 int nRoot, /* Number of entries in aRoot[] */
7801 int mxErr, /* Stop reporting errors after this many */
7802 int *pnErr /* Write number of errors seen to this variable */
7803 ){
7804 Pgno i;
7805 int nRef;
7806 IntegrityCk sCheck;
7807 BtShared *pBt = p->pBt;
7808 char zErr[100];
7809
7810 sqlite3BtreeEnter(p);
7811 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
7812 nRef = sqlite3PagerRefcount(pBt->pPager);
7813 sCheck.pBt = pBt;
7814 sCheck.pPager = pBt->pPager;
7815 sCheck.nPage = btreePagecount(sCheck.pBt);
7816 sCheck.mxErr = mxErr;
7817 sCheck.nErr = 0;
7818 sCheck.mallocFailed = 0;
7819 *pnErr = 0;
7820 if( sCheck.nPage==0 ){
7821 sqlite3BtreeLeave(p);
7822 return 0;
7823 }
7824 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
7825 if( !sCheck.anRef ){
7826 *pnErr = 1;
7827 sqlite3BtreeLeave(p);
7828 return 0;
7829 }
7830 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
7831 i = PENDING_BYTE_PAGE(pBt);
7832 if( i<=sCheck.nPage ){
7833 sCheck.anRef[i] = 1;
7834 }
7835 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
7836 sCheck.errMsg.useMalloc = 2;
7837
7838 /* Check the integrity of the freelist
7839 */
7840 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7841 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
7842
7843 /* Check all the tables.
7844 */
7845 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
7846 if( aRoot[i]==0 ) continue;
7847 #ifndef SQLITE_OMIT_AUTOVACUUM
7848 if( pBt->autoVacuum && aRoot[i]>1 ){
7849 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
7850 }
7851 #endif
7852 checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
7853 }
7854
7855 /* Make sure every page in the file is referenced
7856 */
7857 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
7858 #ifdef SQLITE_OMIT_AUTOVACUUM
7859 if( sCheck.anRef[i]==0 ){
7860 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7861 }
7862 #else
7863 /* If the database supports auto-vacuum, make sure no tables contain
7864 ** references to pointer-map pages.
7865 */
7866 if( sCheck.anRef[i]==0 &&
7867 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
7868 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7869 }
7870 if( sCheck.anRef[i]!=0 &&
7871 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
7872 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
7873 }
7874 #endif
7875 }
7876
7877 /* Make sure this analysis did not leave any unref() pages.
7878 ** This is an internal consistency check; an integrity check
7879 ** of the integrity check.
7880 */
7881 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
7882 checkAppendMsg(&sCheck, 0,
7883 "Outstanding page count goes from %d to %d during this analysis",
7884 nRef, sqlite3PagerRefcount(pBt->pPager)
7885 );
7886 }
7887
7888 /* Clean up and report errors.
7889 */
7890 sqlite3BtreeLeave(p);
7891 sqlite3_free(sCheck.anRef);
7892 if( sCheck.mallocFailed ){
7893 sqlite3StrAccumReset(&sCheck.errMsg);
7894 *pnErr = sCheck.nErr+1;
7895 return 0;
7896 }
7897 *pnErr = sCheck.nErr;
7898 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7899 return sqlite3StrAccumFinish(&sCheck.errMsg);
7900 }
7901 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7902
7903 /*
7904 ** Return the full pathname of the underlying database file.
7905 **
7906 ** The pager filename is invariant as long as the pager is
7907 ** open so it is safe to access without the BtShared mutex.
7908 */
sqlite3BtreeGetFilename(Btree * p)7909 const char *sqlite3BtreeGetFilename(Btree *p){
7910 assert( p->pBt->pPager!=0 );
7911 return sqlite3PagerFilename(p->pBt->pPager);
7912 }
7913
7914 /*
7915 ** Return the pathname of the journal file for this database. The return
7916 ** value of this routine is the same regardless of whether the journal file
7917 ** has been created or not.
7918 **
7919 ** The pager journal filename is invariant as long as the pager is
7920 ** open so it is safe to access without the BtShared mutex.
7921 */
sqlite3BtreeGetJournalname(Btree * p)7922 const char *sqlite3BtreeGetJournalname(Btree *p){
7923 assert( p->pBt->pPager!=0 );
7924 return sqlite3PagerJournalname(p->pBt->pPager);
7925 }
7926
7927 /*
7928 ** Return non-zero if a transaction is active.
7929 */
sqlite3BtreeIsInTrans(Btree * p)7930 int sqlite3BtreeIsInTrans(Btree *p){
7931 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7932 return (p && (p->inTrans==TRANS_WRITE));
7933 }
7934
7935 #ifndef SQLITE_OMIT_WAL
7936 /*
7937 ** Run a checkpoint on the Btree passed as the first argument.
7938 **
7939 ** Return SQLITE_LOCKED if this or any other connection has an open
7940 ** transaction on the shared-cache the argument Btree is connected to.
7941 **
7942 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
7943 */
sqlite3BtreeCheckpoint(Btree * p,int eMode,int * pnLog,int * pnCkpt)7944 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
7945 int rc = SQLITE_OK;
7946 if( p ){
7947 BtShared *pBt = p->pBt;
7948 sqlite3BtreeEnter(p);
7949 if( pBt->inTransaction!=TRANS_NONE ){
7950 rc = SQLITE_LOCKED;
7951 }else{
7952 rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
7953 }
7954 sqlite3BtreeLeave(p);
7955 }
7956 return rc;
7957 }
7958 #endif
7959
7960 /*
7961 ** Return non-zero if a read (or write) transaction is active.
7962 */
sqlite3BtreeIsInReadTrans(Btree * p)7963 int sqlite3BtreeIsInReadTrans(Btree *p){
7964 assert( p );
7965 assert( sqlite3_mutex_held(p->db->mutex) );
7966 return p->inTrans!=TRANS_NONE;
7967 }
7968
/*
** Return 1 if the nBackup counter of Btree p is non-zero and 0 if it
** is zero.  (Presumably nBackup counts the backup operations currently
** using this Btree; confirm against the Btree structure definition.)
**
** p must not be NULL and the caller must hold the database connection
** mutex.
*/
int sqlite3BtreeIsInBackup(Btree *p){
  assert( p );
  assert( sqlite3_mutex_held(p->db->mutex) );
  return p->nBackup ? 1 : 0;
}
7974
7975 /*
7976 ** This function returns a pointer to a blob of memory associated with
7977 ** a single shared-btree. The memory is used by client code for its own
7978 ** purposes (for example, to store a high-level schema associated with
7979 ** the shared-btree). The btree layer manages reference counting issues.
7980 **
7981 ** The first time this is called on a shared-btree, nBytes bytes of memory
7982 ** are allocated, zeroed, and returned to the caller. For each subsequent
7983 ** call the nBytes parameter is ignored and a pointer to the same blob
7984 ** of memory returned.
7985 **
7986 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7987 ** allocated, a null pointer is returned. If the blob has already been
7988 ** allocated, it is returned as normal.
7989 **
7990 ** Just before the shared-btree is closed, the function passed as the
7991 ** xFree argument when the memory allocation was made is invoked on the
7992 ** blob of allocated memory. The xFree function should not call sqlite3_free()
7993 ** on the memory, the btree layer does that.
7994 */
sqlite3BtreeSchema(Btree * p,int nBytes,void (* xFree)(void *))7995 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7996 BtShared *pBt = p->pBt;
7997 sqlite3BtreeEnter(p);
7998 if( !pBt->pSchema && nBytes ){
7999 pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
8000 pBt->xFreeSchema = xFree;
8001 }
8002 sqlite3BtreeLeave(p);
8003 return pBt->pSchema;
8004 }
8005
8006 /*
8007 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
8008 ** btree as the argument handle holds an exclusive lock on the
8009 ** sqlite_master table. Otherwise SQLITE_OK.
8010 */
sqlite3BtreeSchemaLocked(Btree * p)8011 int sqlite3BtreeSchemaLocked(Btree *p){
8012 int rc;
8013 assert( sqlite3_mutex_held(p->db->mutex) );
8014 sqlite3BtreeEnter(p);
8015 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
8016 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
8017 sqlite3BtreeLeave(p);
8018 return rc;
8019 }
8020
8021
8022 #ifndef SQLITE_OMIT_SHARED_CACHE
8023 /*
8024 ** Obtain a lock on the table whose root page is iTab. The
8025 ** lock is a write lock if isWritelock is true or a read lock
8026 ** if it is false.
8027 */
sqlite3BtreeLockTable(Btree * p,int iTab,u8 isWriteLock)8028 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
8029 int rc = SQLITE_OK;
8030 assert( p->inTrans!=TRANS_NONE );
8031 if( p->sharable ){
8032 u8 lockType = READ_LOCK + isWriteLock;
8033 assert( READ_LOCK+1==WRITE_LOCK );
8034 assert( isWriteLock==0 || isWriteLock==1 );
8035
8036 sqlite3BtreeEnter(p);
8037 rc = querySharedCacheTableLock(p, iTab, lockType);
8038 if( rc==SQLITE_OK ){
8039 rc = setSharedCacheTableLock(p, iTab, lockType);
8040 }
8041 sqlite3BtreeLeave(p);
8042 }
8043 return rc;
8044 }
8045 #endif
8046
8047 #ifndef SQLITE_OMIT_INCRBLOB
8048 /*
8049 ** Argument pCsr must be a cursor opened for writing on an
8050 ** INTKEY table currently pointing at a valid table entry.
8051 ** This function modifies the data stored as part of that entry.
8052 **
8053 ** Only the data content may only be modified, it is not possible to
8054 ** change the length of the data stored. If this function is called with
8055 ** parameters that attempt to write past the end of the existing data,
8056 ** no modifications are made and SQLITE_CORRUPT is returned.
8057 */
sqlite3BtreePutData(BtCursor * pCsr,u32 offset,u32 amt,void * z)8058 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
8059 int rc;
8060 assert( cursorHoldsMutex(pCsr) );
8061 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
8062 assert( pCsr->isIncrblobHandle );
8063
8064 rc = restoreCursorPosition(pCsr);
8065 if( rc!=SQLITE_OK ){
8066 return rc;
8067 }
8068 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
8069 if( pCsr->eState!=CURSOR_VALID ){
8070 return SQLITE_ABORT;
8071 }
8072
8073 /* Check some assumptions:
8074 ** (a) the cursor is open for writing,
8075 ** (b) there is a read/write transaction open,
8076 ** (c) the connection holds a write-lock on the table (if required),
8077 ** (d) there are no conflicting read-locks, and
8078 ** (e) the cursor points at a valid row of an intKey table.
8079 */
8080 if( !pCsr->wrFlag ){
8081 return SQLITE_READONLY;
8082 }
8083 assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE );
8084 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
8085 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
8086 assert( pCsr->apPage[pCsr->iPage]->intKey );
8087
8088 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
8089 }
8090
8091 /*
8092 ** Set a flag on this cursor to cache the locations of pages from the
8093 ** overflow list for the current row. This is used by cursors opened
8094 ** for incremental blob IO only.
8095 **
8096 ** This function sets a flag only. The actual page location cache
8097 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
8098 ** accessPayload() (the worker function for sqlite3BtreeData() and
8099 ** sqlite3BtreePutData()).
8100 */
sqlite3BtreeCacheOverflow(BtCursor * pCur)8101 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
8102 assert( cursorHoldsMutex(pCur) );
8103 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
8104 invalidateOverflowCache(pCur);
8105 pCur->isIncrblobHandle = 1;
8106 }
8107 #endif
8108
8109 /*
8110 ** Set both the "read version" (single byte at byte offset 18) and
8111 ** "write version" (single byte at byte offset 19) fields in the database
8112 ** header to iVersion.
8113 */
sqlite3BtreeSetVersion(Btree * pBtree,int iVersion)8114 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
8115 BtShared *pBt = pBtree->pBt;
8116 int rc; /* Return code */
8117
8118 assert( pBtree->inTrans==TRANS_NONE );
8119 assert( iVersion==1 || iVersion==2 );
8120
8121 /* If setting the version fields to 1, do not automatically open the
8122 ** WAL connection, even if the version fields are currently set to 2.
8123 */
8124 pBt->doNotUseWAL = (u8)(iVersion==1);
8125
8126 rc = sqlite3BtreeBeginTrans(pBtree, 0);
8127 if( rc==SQLITE_OK ){
8128 u8 *aData = pBt->pPage1->aData;
8129 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
8130 rc = sqlite3BtreeBeginTrans(pBtree, 2);
8131 if( rc==SQLITE_OK ){
8132 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
8133 if( rc==SQLITE_OK ){
8134 aData[18] = (u8)iVersion;
8135 aData[19] = (u8)iVersion;
8136 }
8137 }
8138 }
8139 }
8140
8141 pBt->doNotUseWAL = 0;
8142 return rc;
8143 }
8144