1 /*
2 ** 2012 Jan 11
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 */
11 /* TODO(shess): THIS MODULE IS STILL EXPERIMENTAL. DO NOT USE IT. */
12 /* Implements a virtual table "recover" which can be used to recover
13 * data from a corrupt table. The table is walked manually, with
14 * corrupt items skipped. Additionally, any errors while reading will
15 * be skipped.
16 *
17 * Given a table with this definition:
18 *
19 * CREATE TABLE Stuff (
20 * name TEXT PRIMARY KEY,
21 * value TEXT NOT NULL
22 * );
23 *
24 * to recover the data from teh table, you could do something like:
25 *
26 * -- Attach another database, the original is not trustworthy.
27 * ATTACH DATABASE '/tmp/db.db' AS rdb;
28 * -- Create a new version of the table.
29 * CREATE TABLE rdb.Stuff (
30 * name TEXT PRIMARY KEY,
31 * value TEXT NOT NULL
32 * );
33 * -- This will read the original table's data.
34 * CREATE VIRTUAL TABLE temp.recover_Stuff using recover(
35 * main.Stuff,
36 * name TEXT STRICT NOT NULL, -- only real TEXT data allowed
37 * value TEXT STRICT NOT NULL
38 * );
39 * -- Corruption means the UNIQUE constraint may no longer hold for
40 * -- Stuff, so either OR REPLACE or OR IGNORE must be used.
41 * INSERT OR REPLACE INTO rdb.Stuff (rowid, name, value )
42 * SELECT rowid, name, value FROM temp.recover_Stuff;
43 * DROP TABLE temp.recover_Stuff;
44 * DETACH DATABASE rdb;
45 * -- Move db.db to replace original db in filesystem.
46 *
47 *
48 * Usage
49 *
50 * Given the goal of dealing with corruption, it would not be safe to
51 * create a recovery table in the database being recovered. So
52 * recovery tables must be created in the temp database. They are not
53 * appropriate to persist, in any case. [As a bonus, sqlite_master
54 * tables can be recovered. Perhaps more cute than useful, though.]
55 *
56 * The parameters are a specifier for the table to read, and a column
57 * definition for each bit of data stored in that table. The named
58 * table must be convertable to a root page number by reading the
59 * sqlite_master table. Bare table names are assumed to be in
60 * database 0 ("main"), other databases can be specified in db.table
61 * fashion.
62 *
63 * Column definitions are similar to BUT NOT THE SAME AS those
64 * provided to CREATE statements:
65 * column-def: column-name [type-name [STRICT] [NOT NULL]]
66 * type-name: (ANY|ROWID|INTEGER|FLOAT|NUMERIC|TEXT|BLOB)
67 *
68 * Only those exact type names are accepted, there is no type
69 * intuition. The only constraints accepted are STRICT (see below)
70 * and NOT NULL. Anything unexpected will cause the create to fail.
71 *
72 * ANY is a convenience to indicate that manifest typing is desired.
73 * It is equivalent to not specifying a type at all. The results for
74 * such columns will have the type of the data's storage. The exposed
75 * schema will contain no type for that column.
76 *
77 * ROWID is used for columns representing aliases to the rowid
78 * (INTEGER PRIMARY KEY, with or without AUTOINCREMENT), to make the
79 * concept explicit. Such columns are actually stored as NULL, so
80 * they cannot be simply ignored. The exposed schema will be INTEGER
81 * for that column.
82 *
83 * NOT NULL causes rows with a NULL in that column to be skipped. It
84 * also adds NOT NULL to the column in the exposed schema. If the
85 * table has ever had columns added using ALTER TABLE, then those
86 * columns implicitly contain NULL for rows which have not been
87 * updated. [Workaround using COALESCE() in your SELECT statement.]
88 *
89 * The created table is read-only, with no indices. Any SELECT will
90 * be a full-table scan, returning each valid row read from the
91 * storage of the backing table. The rowid will be the rowid of the
92 * row from the backing table. "Valid" means:
93 * - The cell metadata for the row is well-formed. Mainly this means that
94 * the cell header info describes a payload of the size indicated by
95 * the cell's payload size.
96 * - The cell does not run off the page.
97 * - The cell does not overlap any other cell on the page.
98 * - The cell contains doesn't contain too many columns.
99 * - The types of the serialized data match the indicated types (see below).
100 *
101 *
102 * Type affinity versus type storage.
103 *
104 * http://www.sqlite.org/datatype3.html describes SQLite's type
105 * affinity system. The system provides for automated coercion of
106 * types in certain cases, transparently enough that many developers
107 * do not realize that it is happening. Importantly, it implies that
108 * the raw data stored in the database may not have the obvious type.
109 *
110 * Differences between the stored data types and the expected data
111 * types may be a signal of corruption. This module makes some
112 * allowances for automatic coercion. It is important to be concious
113 * of the difference between the schema exposed by the module, and the
114 * data types read from storage. The following table describes how
115 * the module interprets things:
116 *
117 * type schema data STRICT
118 * ---- ------ ---- ------
119 * ANY <none> any any
120 * ROWID INTEGER n/a n/a
121 * INTEGER INTEGER integer integer
122 * FLOAT FLOAT integer or float float
123 * NUMERIC NUMERIC integer, float, or text integer or float
124 * TEXT TEXT text or blob text
125 * BLOB BLOB blob blob
126 *
127 * type is the type provided to the recover module, schema is the
128 * schema exposed by the module, data is the acceptable types of data
129 * decoded from storage, and STRICT is a modification of that.
130 *
131 * A very loose recovery system might use ANY for all columns, then
132 * use the appropriate sqlite3_column_*() calls to coerce to expected
133 * types. This doesn't provide much protection if a page from a
134 * different table with the same column count is linked into an
135 * inappropriate btree.
136 *
137 * A very tight recovery system might use STRICT to enforce typing on
138 * all columns, preferring to skip rows which are valid at the storage
139 * level but don't contain the right types. Note that FLOAT STRICT is
140 * almost certainly not appropriate, since integral values are
141 * transparently stored as integers, when that is more efficient.
142 *
143 * Another option is to use ANY for all columns and inspect each
144 * result manually (using sqlite3_column_*). This should only be
145 * necessary in cases where developers have used manifest typing (test
146 * to make sure before you decide that you aren't using manifest
147 * typing!).
148 *
149 *
150 * Caveats
151 *
152 * Leaf pages not referenced by interior nodes will not be found.
153 *
154 * Leaf pages referenced from interior nodes of other tables will not
155 * be resolved.
156 *
157 * Rows referencing invalid overflow pages will be skipped.
158 *
159 * SQlite rows have a header which describes how to interpret the rest
160 * of the payload. The header can be valid in cases where the rest of
161 * the record is actually corrupt (in the sense that the data is not
162 * the intended data). This can especially happen WRT overflow pages,
163 * as lack of atomic updates between pages is the primary form of
164 * corruption I have seen in the wild.
165 */
166 /* The implementation is via a series of cursors. The cursor
167 * implementations follow the pattern:
168 *
169 * // Creates the cursor using various initialization info.
170 * int cursorCreate(...);
171 *
172 * // Returns 1 if there is no more data, 0 otherwise.
173 * int cursorEOF(Cursor *pCursor);
174 *
175 * // Various accessors can be used if not at EOF.
176 *
177 * // Move to the next item.
178 * int cursorNext(Cursor *pCursor);
179 *
180 * // Destroy the memory associated with the cursor.
181 * void cursorDestroy(Cursor *pCursor);
182 *
183 * References in the following are to sections at
184 * http://www.sqlite.org/fileformat2.html .
185 *
186 * RecoverLeafCursor iterates the records in a leaf table node
187 * described in section 1.5 "B-tree Pages". When the node is
188 * exhausted, an interior cursor is used to get the next leaf node,
189 * and iteration continues there.
190 *
191 * RecoverInteriorCursor iterates the child pages in an interior table
192 * node described in section 1.5 "B-tree Pages". When the node is
193 * exhausted, a parent interior cursor is used to get the next
194 * interior node at the same level, and iteration continues there.
195 *
196 * Together these record the path from the leaf level to the root of
197 * the tree. Iteration happens from the leaves rather than the root
198 * both for efficiency and putting the special case at the front of
199 * the list is easier to implement.
200 *
201 * RecoverCursor uses a RecoverLeafCursor to iterate the rows of a
202 * table, returning results via the SQLite virtual table interface.
203 */
204 /* TODO(shess): It might be useful to allow DEFAULT in types to
205 * specify what to do for NULL when an ALTER TABLE case comes up.
206 * Unfortunately, simply adding it to the exposed schema and using
207 * sqlite3_result_null() does not cause the default to be generate.
208 * Handling it ourselves seems hard, unfortunately.
209 */
210
211 #include <assert.h>
212 #include <ctype.h>
213 #include <stdio.h>
214 #include <string.h>
215
216 /* Internal SQLite things that are used:
217 * u32, u64, i64 types.
218 * Btree, Pager, and DbPage structs.
219 * DbPage.pData, .pPager, and .pgno
220 * sqlite3 struct.
221 * sqlite3BtreePager() and sqlite3BtreeGetPageSize()
222 * sqlite3PagerAcquire() and sqlite3PagerUnref()
223 * getVarint().
224 */
225 #include "sqliteInt.h"
226
227 /* For debugging. */
228 #if 0
229 #define FNENTRY() fprintf(stderr, "In %s\n", __FUNCTION__)
230 #else
231 #define FNENTRY()
232 #endif
233
234 /* Generic constants and helper functions. */
235
236 static const unsigned char kTableLeafPage = 0x0D;
237 static const unsigned char kTableInteriorPage = 0x05;
238
239 /* From section 1.5. */
240 static const unsigned kiPageTypeOffset = 0;
241 static const unsigned kiPageFreeBlockOffset = 1;
242 static const unsigned kiPageCellCountOffset = 3;
243 static const unsigned kiPageCellContentOffset = 5;
244 static const unsigned kiPageFragmentedBytesOffset = 7;
245 static const unsigned knPageLeafHeaderBytes = 8;
246 /* Interior pages contain an additional field. */
247 static const unsigned kiPageRightChildOffset = 8;
248 static const unsigned kiPageInteriorHeaderBytes = 12;
249
250 /* Accepted types are specified by a mask. */
251 #define MASK_ROWID (1<<0)
252 #define MASK_INTEGER (1<<1)
253 #define MASK_FLOAT (1<<2)
254 #define MASK_TEXT (1<<3)
255 #define MASK_BLOB (1<<4)
256 #define MASK_NULL (1<<5)
257
258 /* Helpers to decode fixed-size fields. */
decodeUnsigned16(const unsigned char * pData)259 static u32 decodeUnsigned16(const unsigned char *pData){
260 return (pData[0]<<8) + pData[1];
261 }
decodeUnsigned32(const unsigned char * pData)262 static u32 decodeUnsigned32(const unsigned char *pData){
263 return (decodeUnsigned16(pData)<<16) + decodeUnsigned16(pData+2);
264 }
decodeSigned(const unsigned char * pData,unsigned nBytes)265 static i64 decodeSigned(const unsigned char *pData, unsigned nBytes){
266 i64 r = (char)(*pData);
267 while( --nBytes ){
268 r <<= 8;
269 r += *(++pData);
270 }
271 return r;
272 }
273 /* Derived from vdbeaux.c, sqlite3VdbeSerialGet(), case 7. */
274 /* TODO(shess): Determine if swapMixedEndianFloat() applies. */
decodeFloat64(const unsigned char * pData)275 static double decodeFloat64(const unsigned char *pData){
276 #if !defined(NDEBUG)
277 static const u64 t1 = ((u64)0x3ff00000)<<32;
278 static const double r1 = 1.0;
279 u64 t2 = t1;
280 assert( sizeof(r1)==sizeof(t2) && memcmp(&r1, &t2, sizeof(r1))==0 );
281 #endif
282 i64 x = decodeSigned(pData, 8);
283 double d;
284 memcpy(&d, &x, sizeof(x));
285 return d;
286 }
287
288 /* Return true if a varint can safely be read from pData/nData. */
289 /* TODO(shess): DbPage points into the middle of a buffer which
290 * contains the page data before DbPage. So code should always be
291 * able to read a small number of varints safely. Consider whether to
292 * trust that or not.
293 */
checkVarint(const unsigned char * pData,unsigned nData)294 static int checkVarint(const unsigned char *pData, unsigned nData){
295 unsigned i;
296
297 /* In the worst case the decoder takes all 8 bits of the 9th byte. */
298 if( nData>=9 ){
299 return 1;
300 }
301
302 /* Look for a high-bit-clear byte in what's left. */
303 for( i=0; i<nData; ++i ){
304 if( !(pData[i]&0x80) ){
305 return 1;
306 }
307 }
308
309 /* Cannot decode in the space given. */
310 return 0;
311 }
312
313 /* Return 1 if n varints can be read from pData/nData. */
checkVarints(const unsigned char * pData,unsigned nData,unsigned n)314 static int checkVarints(const unsigned char *pData, unsigned nData,
315 unsigned n){
316 unsigned nCur = 0; /* Byte offset within current varint. */
317 unsigned nFound = 0; /* Number of varints found. */
318 unsigned i;
319
320 /* In the worst case the decoder takes all 8 bits of the 9th byte. */
321 if( nData>=9*n ){
322 return 1;
323 }
324
325 for( i=0; nFound<n && i<nData; ++i ){
326 nCur++;
327 if( nCur==9 || !(pData[i]&0x80) ){
328 nFound++;
329 nCur = 0;
330 }
331 }
332
333 return nFound==n;
334 }
335
336 /* ctype and str[n]casecmp() can be affected by locale (eg, tr_TR).
337 * These versions consider only the ASCII space.
338 */
339 /* TODO(shess): It may be reasonable to just remove the need for these
340 * entirely. The module could require "TEXT STRICT NOT NULL", not
341 * "Text Strict Not Null" or whatever the developer felt like typing
342 * that day. Handling corrupt data is a PERFECT place to be pedantic.
343 */
ascii_isspace(char c)344 static int ascii_isspace(char c){
345 /* From fts3_expr.c */
346 return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
347 }
ascii_isalnum(int x)348 static int ascii_isalnum(int x){
349 /* From fts3_tokenizer1.c */
350 return (x>='0' && x<='9') || (x>='A' && x<='Z') || (x>='a' && x<='z');
351 }
ascii_tolower(int x)352 static int ascii_tolower(int x){
353 /* From fts3_tokenizer1.c */
354 return (x>='A' && x<='Z') ? x-'A'+'a' : x;
355 }
356 /* TODO(shess): Consider sqlite3_strnicmp() */
ascii_strncasecmp(const char * s1,const char * s2,size_t n)357 static int ascii_strncasecmp(const char *s1, const char *s2, size_t n){
358 const unsigned char *us1 = (const unsigned char *)s1;
359 const unsigned char *us2 = (const unsigned char *)s2;
360 while( *us1 && *us2 && n && ascii_tolower(*us1)==ascii_tolower(*us2) ){
361 us1++, us2++, n--;
362 }
363 return n ? ascii_tolower(*us1)-ascii_tolower(*us2) : 0;
364 }
ascii_strcasecmp(const char * s1,const char * s2)365 static int ascii_strcasecmp(const char *s1, const char *s2){
366 /* If s2 is equal through strlen(s1), will exit while() due to s1's
367 * trailing NUL, and return NUL-s2[strlen(s1)].
368 */
369 return ascii_strncasecmp(s1, s2, strlen(s1)+1);
370 }
371
372 /* For some reason I kept making mistakes with offset calculations. */
PageData(DbPage * pPage,unsigned iOffset)373 static const unsigned char *PageData(DbPage *pPage, unsigned iOffset){
374 assert( iOffset<=pPage->nPageSize );
375 return (unsigned char *)pPage->pData + iOffset;
376 }
377
378 /* The first page in the file contains a file header in the first 100
379 * bytes. The page's header information comes after that. Note that
380 * the offsets in the page's header information are relative to the
381 * beginning of the page, NOT the end of the page header.
382 */
PageHeader(DbPage * pPage)383 static const unsigned char *PageHeader(DbPage *pPage){
384 if( pPage->pgno==1 ){
385 const unsigned nDatabaseHeader = 100;
386 return PageData(pPage, nDatabaseHeader);
387 }else{
388 return PageData(pPage, 0);
389 }
390 }
391
392 /* Helper to fetch the pager and page size for the named database. */
GetPager(sqlite3 * db,const char * zName,Pager ** pPager,unsigned * pnPageSize)393 static int GetPager(sqlite3 *db, const char *zName,
394 Pager **pPager, unsigned *pnPageSize){
395 Btree *pBt = NULL;
396 int i;
397 for( i=0; i<db->nDb; ++i ){
398 if( ascii_strcasecmp(db->aDb[i].zName, zName)==0 ){
399 pBt = db->aDb[i].pBt;
400 break;
401 }
402 }
403 if( !pBt ){
404 return SQLITE_ERROR;
405 }
406
407 *pPager = sqlite3BtreePager(pBt);
408 *pnPageSize = sqlite3BtreeGetPageSize(pBt) - sqlite3BtreeGetReserve(pBt);
409 return SQLITE_OK;
410 }
411
412 /* iSerialType is a type read from a record header. See "2.1 Record Format".
413 */
414
415 /* Storage size of iSerialType in bytes. My interpretation of SQLite
416 * documentation is that text and blob fields can have 32-bit length.
417 * Values past 2^31-12 will need more than 32 bits to encode, which is
418 * why iSerialType is u64.
419 */
SerialTypeLength(u64 iSerialType)420 static u32 SerialTypeLength(u64 iSerialType){
421 switch( iSerialType ){
422 case 0 : return 0; /* NULL */
423 case 1 : return 1; /* Various integers. */
424 case 2 : return 2;
425 case 3 : return 3;
426 case 4 : return 4;
427 case 5 : return 6;
428 case 6 : return 8;
429 case 7 : return 8; /* 64-bit float. */
430 case 8 : return 0; /* Constant 0. */
431 case 9 : return 0; /* Constant 1. */
432 case 10 : case 11 : assert( !"RESERVED TYPE"); return 0;
433 }
434 return (u32)((iSerialType>>1) - 6);
435 }
436
437 /* True if iSerialType refers to a blob. */
SerialTypeIsBlob(u64 iSerialType)438 static int SerialTypeIsBlob(u64 iSerialType){
439 assert( iSerialType>=12 );
440 return (iSerialType%2)==0;
441 }
442
443 /* Returns true if the serialized type represented by iSerialType is
444 * compatible with the given type mask.
445 */
SerialTypeIsCompatible(u64 iSerialType,unsigned char mask)446 static int SerialTypeIsCompatible(u64 iSerialType, unsigned char mask){
447 switch( iSerialType ){
448 case 0 : return (mask&MASK_NULL)!=0;
449 case 1 : return (mask&MASK_INTEGER)!=0;
450 case 2 : return (mask&MASK_INTEGER)!=0;
451 case 3 : return (mask&MASK_INTEGER)!=0;
452 case 4 : return (mask&MASK_INTEGER)!=0;
453 case 5 : return (mask&MASK_INTEGER)!=0;
454 case 6 : return (mask&MASK_INTEGER)!=0;
455 case 7 : return (mask&MASK_FLOAT)!=0;
456 case 8 : return (mask&MASK_INTEGER)!=0;
457 case 9 : return (mask&MASK_INTEGER)!=0;
458 case 10 : assert( !"RESERVED TYPE"); return 0;
459 case 11 : assert( !"RESERVED TYPE"); return 0;
460 }
461 return (mask&(SerialTypeIsBlob(iSerialType) ? MASK_BLOB : MASK_TEXT));
462 }
463
464 /* Versions of strdup() with return values appropriate for
465 * sqlite3_free(). malloc.c has sqlite3DbStrDup()/NDup(), but those
466 * need sqlite3DbFree(), which seems intrusive.
467 */
sqlite3_strndup(const char * z,unsigned n)468 static char *sqlite3_strndup(const char *z, unsigned n){
469 char *zNew;
470
471 if( z==NULL ){
472 return NULL;
473 }
474
475 zNew = sqlite3_malloc(n+1);
476 if( zNew!=NULL ){
477 memcpy(zNew, z, n);
478 zNew[n] = '\0';
479 }
480 return zNew;
481 }
sqlite3_strdup(const char * z)482 static char *sqlite3_strdup(const char *z){
483 if( z==NULL ){
484 return NULL;
485 }
486 return sqlite3_strndup(z, strlen(z));
487 }
488
489 /* Fetch the page number of zTable in zDb from sqlite_master in zDb,
490 * and put it in *piRootPage.
491 */
getRootPage(sqlite3 * db,const char * zDb,const char * zTable,u32 * piRootPage)492 static int getRootPage(sqlite3 *db, const char *zDb, const char *zTable,
493 u32 *piRootPage){
494 char *zSql; /* SQL selecting root page of named element. */
495 sqlite3_stmt *pStmt;
496 int rc;
497
498 if( strcmp(zTable, "sqlite_master")==0 ){
499 *piRootPage = 1;
500 return SQLITE_OK;
501 }
502
503 zSql = sqlite3_mprintf("SELECT rootpage FROM %s.sqlite_master "
504 "WHERE type = 'table' AND tbl_name = %Q",
505 zDb, zTable);
506 if( !zSql ){
507 return SQLITE_NOMEM;
508 }
509
510 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
511 sqlite3_free(zSql);
512 if( rc!=SQLITE_OK ){
513 return rc;
514 }
515
516 /* Require a result. */
517 rc = sqlite3_step(pStmt);
518 if( rc==SQLITE_DONE ){
519 rc = SQLITE_CORRUPT;
520 }else if( rc==SQLITE_ROW ){
521 *piRootPage = sqlite3_column_int(pStmt, 0);
522
523 /* Require only one result. */
524 rc = sqlite3_step(pStmt);
525 if( rc==SQLITE_DONE ){
526 rc = SQLITE_OK;
527 }else if( rc==SQLITE_ROW ){
528 rc = SQLITE_CORRUPT;
529 }
530 }
531 sqlite3_finalize(pStmt);
532 return rc;
533 }
534
getEncoding(sqlite3 * db,const char * zDb,int * piEncoding)535 static int getEncoding(sqlite3 *db, const char *zDb, int* piEncoding){
536 sqlite3_stmt *pStmt;
537 int rc;
538 char *zSql = sqlite3_mprintf("PRAGMA %s.encoding", zDb);
539 if( !zSql ){
540 return SQLITE_NOMEM;
541 }
542
543 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
544 sqlite3_free(zSql);
545 if( rc!=SQLITE_OK ){
546 return rc;
547 }
548
549 /* Require a result. */
550 rc = sqlite3_step(pStmt);
551 if( rc==SQLITE_DONE ){
552 /* This case should not be possible. */
553 rc = SQLITE_CORRUPT;
554 }else if( rc==SQLITE_ROW ){
555 if( sqlite3_column_type(pStmt, 0)==SQLITE_TEXT ){
556 const char* z = (const char *)sqlite3_column_text(pStmt, 0);
557 /* These strings match the literals in pragma.c. */
558 if( !strcmp(z, "UTF-16le") ){
559 *piEncoding = SQLITE_UTF16LE;
560 }else if( !strcmp(z, "UTF-16be") ){
561 *piEncoding = SQLITE_UTF16BE;
562 }else if( !strcmp(z, "UTF-8") ){
563 *piEncoding = SQLITE_UTF8;
564 }else{
565 /* This case should not be possible. */
566 *piEncoding = SQLITE_UTF8;
567 }
568 }else{
569 /* This case should not be possible. */
570 *piEncoding = SQLITE_UTF8;
571 }
572
573 /* Require only one result. */
574 rc = sqlite3_step(pStmt);
575 if( rc==SQLITE_DONE ){
576 rc = SQLITE_OK;
577 }else if( rc==SQLITE_ROW ){
578 /* This case should not be possible. */
579 rc = SQLITE_CORRUPT;
580 }
581 }
582 sqlite3_finalize(pStmt);
583 return rc;
584 }
585
586 /* Cursor for iterating interior nodes. Interior page cells contain a
587 * child page number and a rowid. The child page contains items left
588 * of the rowid (less than). The rightmost page of the subtree is
589 * stored in the page header.
590 *
591 * interiorCursorDestroy - release all resources associated with the
592 * cursor and any parent cursors.
593 * interiorCursorCreate - create a cursor with the given parent and page.
594 * interiorCursorEOF - returns true if neither the cursor nor the
595 * parent cursors can return any more data.
596 * interiorCursorNextPage - fetch the next child page from the cursor.
597 *
598 * Logically, interiorCursorNextPage() returns the next child page
599 * number from the page the cursor is currently reading, calling the
600 * parent cursor as necessary to get new pages to read, until done.
601 * SQLITE_ROW if a page is returned, SQLITE_DONE if out of pages,
602 * error otherwise. Unfortunately, if the table is corrupted
603 * unexpected pages can be returned. If any unexpected page is found,
604 * leaf or otherwise, it is returned to the caller for processing,
605 * with the interior cursor left empty. The next call to
606 * interiorCursorNextPage() will recurse to the parent cursor until an
607 * interior page to iterate is returned.
608 *
609 * Note that while interiorCursorNextPage() will refuse to follow
610 * loops, it does not keep track of pages returned for purposes of
611 * preventing duplication.
612 *
613 * Note that interiorCursorEOF() could return false (not at EOF), and
614 * interiorCursorNextPage() could still return SQLITE_DONE. This
615 * could happen if there are more cells to iterate in an interior
616 * page, but those cells refer to invalid pages.
617 */
618 typedef struct RecoverInteriorCursor RecoverInteriorCursor;
619 struct RecoverInteriorCursor {
620 RecoverInteriorCursor *pParent; /* Parent node to this node. */
621 DbPage *pPage; /* Reference to leaf page. */
622 unsigned nPageSize; /* Size of page. */
623 unsigned nChildren; /* Number of children on the page. */
624 unsigned iChild; /* Index of next child to return. */
625 };
626
interiorCursorDestroy(RecoverInteriorCursor * pCursor)627 static void interiorCursorDestroy(RecoverInteriorCursor *pCursor){
628 /* Destroy all the cursors to the root. */
629 while( pCursor ){
630 RecoverInteriorCursor *p = pCursor;
631 pCursor = pCursor->pParent;
632
633 if( p->pPage ){
634 sqlite3PagerUnref(p->pPage);
635 p->pPage = NULL;
636 }
637
638 memset(p, 0xA5, sizeof(*p));
639 sqlite3_free(p);
640 }
641 }
642
643 /* Internal helper. Reset storage in preparation for iterating pPage. */
interiorCursorSetPage(RecoverInteriorCursor * pCursor,DbPage * pPage)644 static void interiorCursorSetPage(RecoverInteriorCursor *pCursor,
645 DbPage *pPage){
646 assert( PageHeader(pPage)[kiPageTypeOffset]==kTableInteriorPage );
647
648 if( pCursor->pPage ){
649 sqlite3PagerUnref(pCursor->pPage);
650 pCursor->pPage = NULL;
651 }
652 pCursor->pPage = pPage;
653 pCursor->iChild = 0;
654
655 /* A child for each cell, plus one in the header. */
656 /* TODO(shess): Sanity-check the count? Page header plus per-cell
657 * cost of 16-bit offset, 32-bit page number, and one varint
658 * (minimum 1 byte).
659 */
660 pCursor->nChildren = decodeUnsigned16(PageHeader(pPage) +
661 kiPageCellCountOffset) + 1;
662 }
663
interiorCursorCreate(RecoverInteriorCursor * pParent,DbPage * pPage,int nPageSize,RecoverInteriorCursor ** ppCursor)664 static int interiorCursorCreate(RecoverInteriorCursor *pParent,
665 DbPage *pPage, int nPageSize,
666 RecoverInteriorCursor **ppCursor){
667 RecoverInteriorCursor *pCursor =
668 sqlite3_malloc(sizeof(RecoverInteriorCursor));
669 if( !pCursor ){
670 return SQLITE_NOMEM;
671 }
672
673 memset(pCursor, 0, sizeof(*pCursor));
674 pCursor->pParent = pParent;
675 pCursor->nPageSize = nPageSize;
676 interiorCursorSetPage(pCursor, pPage);
677 *ppCursor = pCursor;
678 return SQLITE_OK;
679 }
680
681 /* Internal helper. Return the child page number at iChild. */
interiorCursorChildPage(RecoverInteriorCursor * pCursor)682 static unsigned interiorCursorChildPage(RecoverInteriorCursor *pCursor){
683 const unsigned char *pPageHeader; /* Header of the current page. */
684 const unsigned char *pCellOffsets; /* Offset to page's cell offsets. */
685 unsigned iCellOffset; /* Offset of target cell. */
686
687 assert( pCursor->iChild<pCursor->nChildren );
688
689 /* Rightmost child is in the header. */
690 pPageHeader = PageHeader(pCursor->pPage);
691 if( pCursor->iChild==pCursor->nChildren-1 ){
692 return decodeUnsigned32(pPageHeader + kiPageRightChildOffset);
693 }
694
695 /* Each cell is a 4-byte integer page number and a varint rowid
696 * which is greater than the rowid of items in that sub-tree (this
697 * module ignores ordering). The offset is from the beginning of the
698 * page, not from the page header.
699 */
700 pCellOffsets = pPageHeader + kiPageInteriorHeaderBytes;
701 iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iChild*2);
702 if( iCellOffset<=pCursor->nPageSize-4 ){
703 return decodeUnsigned32(PageData(pCursor->pPage, iCellOffset));
704 }
705
706 /* TODO(shess): Check for cell overlaps? Cells require 4 bytes plus
707 * a varint. Check could be identical to leaf check (or even a
708 * shared helper testing for "Cells starting in this range"?).
709 */
710
711 /* If the offset is broken, return an invalid page number. */
712 return 0;
713 }
714
interiorCursorEOF(RecoverInteriorCursor * pCursor)715 static int interiorCursorEOF(RecoverInteriorCursor *pCursor){
716 /* Find a parent with remaining children. EOF if none found. */
717 while( pCursor && pCursor->iChild>=pCursor->nChildren ){
718 pCursor = pCursor->pParent;
719 }
720 return pCursor==NULL;
721 }
722
723 /* Internal helper. Used to detect if iPage would cause a loop. */
interiorCursorPageInUse(RecoverInteriorCursor * pCursor,unsigned iPage)724 static int interiorCursorPageInUse(RecoverInteriorCursor *pCursor,
725 unsigned iPage){
726 /* Find any parent using the indicated page. */
727 while( pCursor && pCursor->pPage->pgno!=iPage ){
728 pCursor = pCursor->pParent;
729 }
730 return pCursor!=NULL;
731 }
732
733 /* Get the next page from the interior cursor at *ppCursor. Returns
734 * SQLITE_ROW with the page in *ppPage, or SQLITE_DONE if out of
735 * pages, or the error SQLite returned.
736 *
737 * If the tree is uneven, then when the cursor attempts to get a new
738 * interior page from the parent cursor, it may get a non-interior
739 * page. In that case, the new page is returned, and *ppCursor is
740 * updated to point to the parent cursor (this cursor is freed).
741 */
742 /* TODO(shess): I've tried to avoid recursion in most of this code,
743 * but this case is more challenging because the recursive call is in
744 * the middle of operation. One option for converting it without
745 * adding memory management would be to retain the head pointer and
746 * use a helper to "back up" as needed. Another option would be to
747 * reverse the list during traversal.
748 */
interiorCursorNextPage(RecoverInteriorCursor ** ppCursor,DbPage ** ppPage)749 static int interiorCursorNextPage(RecoverInteriorCursor **ppCursor,
750 DbPage **ppPage){
751 RecoverInteriorCursor *pCursor = *ppCursor;
752 while( 1 ){
753 int rc;
754 const unsigned char *pPageHeader; /* Header of found page. */
755
756 /* Find a valid child page which isn't on the stack. */
757 while( pCursor->iChild<pCursor->nChildren ){
758 const unsigned iPage = interiorCursorChildPage(pCursor);
759 pCursor->iChild++;
760 if( interiorCursorPageInUse(pCursor, iPage) ){
761 fprintf(stderr, "Loop detected at %d\n", iPage);
762 }else{
763 int rc = sqlite3PagerAcquire(pCursor->pPage->pPager, iPage, ppPage, 0);
764 if( rc==SQLITE_OK ){
765 return SQLITE_ROW;
766 }
767 }
768 }
769
770 /* This page has no more children. Get next page from parent. */
771 if( !pCursor->pParent ){
772 return SQLITE_DONE;
773 }
774 rc = interiorCursorNextPage(&pCursor->pParent, ppPage);
775 if( rc!=SQLITE_ROW ){
776 return rc;
777 }
778
779 /* If a non-interior page is received, that either means that the
780 * tree is uneven, or that a child was re-used (say as an overflow
781 * page). Remove this cursor and let the caller handle the page.
782 */
783 pPageHeader = PageHeader(*ppPage);
784 if( pPageHeader[kiPageTypeOffset]!=kTableInteriorPage ){
785 *ppCursor = pCursor->pParent;
786 pCursor->pParent = NULL;
787 interiorCursorDestroy(pCursor);
788 return SQLITE_ROW;
789 }
790
791 /* Iterate the new page. */
792 interiorCursorSetPage(pCursor, *ppPage);
793 *ppPage = NULL;
794 }
795
796 assert(NULL); /* NOTREACHED() */
797 return SQLITE_CORRUPT;
798 }
799
800 /* Large rows are spilled to overflow pages. The row's main page
801 * stores the overflow page number after the local payload, with a
802 * linked list forward from there as necessary. overflowMaybeCreate()
803 * and overflowGetSegment() provide an abstraction for accessing such
804 * data while centralizing the code.
805 *
806 * overflowDestroy - releases all resources associated with the structure.
807 * overflowMaybeCreate - create the overflow structure if it is needed
808 * to represent the given record. See function comment.
809 * overflowGetSegment - fetch a segment from the record, accounting
810 * for overflow pages. Segments which are not
811 * entirely contained with a page are constructed
812 * into a buffer which is returned. See function comment.
813 */
814 typedef struct RecoverOverflow RecoverOverflow;
815 struct RecoverOverflow {
816 RecoverOverflow *pNextOverflow;
817 DbPage *pPage;
818 unsigned nPageSize;
819 };
820
overflowDestroy(RecoverOverflow * pOverflow)821 static void overflowDestroy(RecoverOverflow *pOverflow){
822 while( pOverflow ){
823 RecoverOverflow *p = pOverflow;
824 pOverflow = p->pNextOverflow;
825
826 if( p->pPage ){
827 sqlite3PagerUnref(p->pPage);
828 p->pPage = NULL;
829 }
830
831 memset(p, 0xA5, sizeof(*p));
832 sqlite3_free(p);
833 }
834 }
835
836 /* Internal helper. Used to detect if iPage would cause a loop. */
overflowPageInUse(RecoverOverflow * pOverflow,unsigned iPage)837 static int overflowPageInUse(RecoverOverflow *pOverflow, unsigned iPage){
838 while( pOverflow && pOverflow->pPage->pgno!=iPage ){
839 pOverflow = pOverflow->pNextOverflow;
840 }
841 return pOverflow!=NULL;
842 }
843
844 /* Setup to access an nRecordBytes record beginning at iRecordOffset
845 * in pPage. If nRecordBytes can be satisfied entirely from pPage,
846 * then no overflow pages are needed an *pnLocalRecordBytes is set to
847 * nRecordBytes. Otherwise, *ppOverflow is set to the head of a list
848 * of overflow pages, and *pnLocalRecordBytes is set to the number of
849 * bytes local to pPage.
850 *
851 * overflowGetSegment() will do the right thing regardless of whether
852 * those values are set to be in-page or not.
853 */
overflowMaybeCreate(DbPage * pPage,unsigned nPageSize,unsigned iRecordOffset,unsigned nRecordBytes,unsigned * pnLocalRecordBytes,RecoverOverflow ** ppOverflow)854 static int overflowMaybeCreate(DbPage *pPage, unsigned nPageSize,
855 unsigned iRecordOffset, unsigned nRecordBytes,
856 unsigned *pnLocalRecordBytes,
857 RecoverOverflow **ppOverflow){
858 unsigned nLocalRecordBytes; /* Record bytes in the leaf page. */
859 unsigned iNextPage; /* Next page number for record data. */
860 unsigned nBytes; /* Maximum record bytes as of current page. */
861 int rc;
862 RecoverOverflow *pFirstOverflow; /* First in linked list of pages. */
863 RecoverOverflow *pLastOverflow; /* End of linked list. */
864
865 /* Calculations from the "Table B-Tree Leaf Cell" part of section
866 * 1.5 of http://www.sqlite.org/fileformat2.html . maxLocal and
867 * minLocal to match naming in btree.c.
868 */
869 const unsigned maxLocal = nPageSize - 35;
870 const unsigned minLocal = ((nPageSize-12)*32/255)-23; /* m */
871
872 /* Always fit anything smaller than maxLocal. */
873 if( nRecordBytes<=maxLocal ){
874 *pnLocalRecordBytes = nRecordBytes;
875 *ppOverflow = NULL;
876 return SQLITE_OK;
877 }
878
879 /* Calculate the remainder after accounting for minLocal on the leaf
880 * page and what packs evenly into overflow pages. If the remainder
881 * does not fit into maxLocal, then a partially-full overflow page
882 * will be required in any case, so store as little as possible locally.
883 */
884 nLocalRecordBytes = minLocal+((nRecordBytes-minLocal)%(nPageSize-4));
885 if( maxLocal<nLocalRecordBytes ){
886 nLocalRecordBytes = minLocal;
887 }
888
889 /* Don't read off the end of the page. */
890 if( iRecordOffset+nLocalRecordBytes+4>nPageSize ){
891 return SQLITE_CORRUPT;
892 }
893
894 /* First overflow page number is after the local bytes. */
895 iNextPage =
896 decodeUnsigned32(PageData(pPage, iRecordOffset + nLocalRecordBytes));
897 nBytes = nLocalRecordBytes;
898
899 /* While there are more pages to read, and more bytes are needed,
900 * get another page.
901 */
902 pFirstOverflow = pLastOverflow = NULL;
903 rc = SQLITE_OK;
904 while( iNextPage && nBytes<nRecordBytes ){
905 RecoverOverflow *pOverflow; /* New overflow page for the list. */
906
907 rc = sqlite3PagerAcquire(pPage->pPager, iNextPage, &pPage, 0);
908 if( rc!=SQLITE_OK ){
909 break;
910 }
911
912 pOverflow = sqlite3_malloc(sizeof(RecoverOverflow));
913 if( !pOverflow ){
914 sqlite3PagerUnref(pPage);
915 rc = SQLITE_NOMEM;
916 break;
917 }
918 memset(pOverflow, 0, sizeof(*pOverflow));
919 pOverflow->pPage = pPage;
920 pOverflow->nPageSize = nPageSize;
921
922 if( !pFirstOverflow ){
923 pFirstOverflow = pOverflow;
924 }else{
925 pLastOverflow->pNextOverflow = pOverflow;
926 }
927 pLastOverflow = pOverflow;
928
929 iNextPage = decodeUnsigned32(pPage->pData);
930 nBytes += nPageSize-4;
931
932 /* Avoid loops. */
933 if( overflowPageInUse(pFirstOverflow, iNextPage) ){
934 fprintf(stderr, "Overflow loop detected at %d\n", iNextPage);
935 rc = SQLITE_CORRUPT;
936 break;
937 }
938 }
939
940 /* If there were not enough pages, or too many, things are corrupt.
941 * Not having enough pages is an obvious problem, all the data
942 * cannot be read. Too many pages means that the contents of the
943 * row between the main page and the overflow page(s) is
944 * inconsistent (most likely one or more of the overflow pages does
945 * not really belong to this row).
946 */
947 if( rc==SQLITE_OK && (nBytes<nRecordBytes || iNextPage) ){
948 rc = SQLITE_CORRUPT;
949 }
950
951 if( rc==SQLITE_OK ){
952 *ppOverflow = pFirstOverflow;
953 *pnLocalRecordBytes = nLocalRecordBytes;
954 }else if( pFirstOverflow ){
955 overflowDestroy(pFirstOverflow);
956 }
957 return rc;
958 }
959
960 /* Use in concert with overflowMaybeCreate() to efficiently read parts
961 * of a potentially-overflowing record. pPage and iRecordOffset are
962 * the values passed into overflowMaybeCreate(), nLocalRecordBytes and
963 * pOverflow are the values returned by that call.
964 *
965 * On SQLITE_OK, *ppBase points to nRequestBytes of data at
966 * iRequestOffset within the record. If the data exists contiguously
967 * in a page, a direct pointer is returned, otherwise a buffer from
968 * sqlite3_malloc() is returned with the data. *pbFree is set true if
969 * sqlite3_free() should be called on *ppBase.
970 */
971 /* Operation of this function is subtle. At any time, pPage is the
972 * current page, with iRecordOffset and nLocalRecordBytes being record
973 * data within pPage, and pOverflow being the overflow page after
974 * pPage. This allows the code to handle both the initial leaf page
975 * and overflow pages consistently by adjusting the values
976 * appropriately.
977 */
overflowGetSegment(DbPage * pPage,unsigned iRecordOffset,unsigned nLocalRecordBytes,RecoverOverflow * pOverflow,unsigned iRequestOffset,unsigned nRequestBytes,unsigned char ** ppBase,int * pbFree)978 static int overflowGetSegment(DbPage *pPage, unsigned iRecordOffset,
979 unsigned nLocalRecordBytes,
980 RecoverOverflow *pOverflow,
981 unsigned iRequestOffset, unsigned nRequestBytes,
982 unsigned char **ppBase, int *pbFree){
983 unsigned nBase; /* Amount of data currently collected. */
984 unsigned char *pBase; /* Buffer to collect record data into. */
985
986 /* Skip to the page containing the start of the data. */
987 while( iRequestOffset>=nLocalRecordBytes && pOverflow ){
988 /* Factor out current page's contribution. */
989 iRequestOffset -= nLocalRecordBytes;
990
991 /* Move forward to the next page in the list. */
992 pPage = pOverflow->pPage;
993 iRecordOffset = 4;
994 nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
995 pOverflow = pOverflow->pNextOverflow;
996 }
997
998 /* If the requested data is entirely within this page, return a
999 * pointer into the page.
1000 */
1001 if( iRequestOffset+nRequestBytes<=nLocalRecordBytes ){
1002 /* TODO(shess): "assignment discards qualifiers from pointer target type"
1003 * Having ppBase be const makes sense, but sqlite3_free() takes non-const.
1004 */
1005 *ppBase = (unsigned char *)PageData(pPage, iRecordOffset + iRequestOffset);
1006 *pbFree = 0;
1007 return SQLITE_OK;
1008 }
1009
1010 /* The data range would require additional pages. */
1011 if( !pOverflow ){
1012 /* Should never happen, the range is outside the nRecordBytes
1013 * passed to overflowMaybeCreate().
1014 */
1015 assert(NULL); /* NOTREACHED */
1016 return SQLITE_ERROR;
1017 }
1018
1019 /* Get a buffer to construct into. */
1020 nBase = 0;
1021 pBase = sqlite3_malloc(nRequestBytes);
1022 if( !pBase ){
1023 return SQLITE_NOMEM;
1024 }
1025 while( nBase<nRequestBytes ){
1026 /* Copy over data present on this page. */
1027 unsigned nCopyBytes = nRequestBytes - nBase;
1028 if( nLocalRecordBytes-iRequestOffset<nCopyBytes ){
1029 nCopyBytes = nLocalRecordBytes - iRequestOffset;
1030 }
1031 memcpy(pBase + nBase, PageData(pPage, iRecordOffset + iRequestOffset),
1032 nCopyBytes);
1033 nBase += nCopyBytes;
1034
1035 if( pOverflow ){
1036 /* Copy from start of record data in future pages. */
1037 iRequestOffset = 0;
1038
1039 /* Move forward to the next page in the list. Should match
1040 * first while() loop.
1041 */
1042 pPage = pOverflow->pPage;
1043 iRecordOffset = 4;
1044 nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
1045 pOverflow = pOverflow->pNextOverflow;
1046 }else if( nBase<nRequestBytes ){
1047 /* Ran out of overflow pages with data left to deliver. Not
1048 * possible if the requested range fits within nRecordBytes
1049 * passed to overflowMaybeCreate() when creating pOverflow.
1050 */
1051 assert(NULL); /* NOTREACHED */
1052 sqlite3_free(pBase);
1053 return SQLITE_ERROR;
1054 }
1055 }
1056 assert( nBase==nRequestBytes );
1057 *ppBase = pBase;
1058 *pbFree = 1;
1059 return SQLITE_OK;
1060 }
1061
1062 /* Primary structure for iterating the contents of a table.
1063 *
1064 * leafCursorDestroy - release all resources associated with the cursor.
1065 * leafCursorCreate - create a cursor to iterate items from tree at
1066 * the provided root page.
1067 * leafCursorNextValidCell - get the cursor ready to access data from
1068 * the next valid cell in the table.
1069 * leafCursorCellRowid - get the current cell's rowid.
1070 * leafCursorCellColumns - get current cell's column count.
1071 * leafCursorCellColInfo - get type and data for a column in current cell.
1072 *
1073 * leafCursorNextValidCell skips cells which fail simple integrity
1074 * checks, such as overlapping other cells, or being located at
1075 * impossible offsets, or where header data doesn't correctly describe
1076 * payload data. Returns SQLITE_ROW if a valid cell is found,
1077 * SQLITE_DONE if all pages in the tree were exhausted.
1078 *
1079 * leafCursorCellColInfo() accounts for overflow pages in the style of
1080 * overflowGetSegment().
1081 */
1082 typedef struct RecoverLeafCursor RecoverLeafCursor;
1083 struct RecoverLeafCursor {
1084 RecoverInteriorCursor *pParent; /* Parent node to this node. */
1085 DbPage *pPage; /* Reference to leaf page. */
1086 unsigned nPageSize; /* Size of pPage. */
1087 unsigned nCells; /* Number of cells in pPage. */
1088 unsigned iCell; /* Current cell. */
1089
1090 /* Info parsed from data in iCell. */
1091 i64 iRowid; /* rowid parsed. */
1092 unsigned nRecordCols; /* how many items in the record. */
1093 u64 iRecordOffset; /* offset to record data. */
1094 /* TODO(shess): nRecordBytes and nRecordHeaderBytes are used in
1095 * leafCursorCellColInfo() to prevent buffer overruns.
1096 * leafCursorCellDecode() already verified that the cell is valid, so
1097 * those checks should be redundant.
1098 */
1099 u64 nRecordBytes; /* Size of record data. */
1100 unsigned nLocalRecordBytes; /* Amount of record data in-page. */
1101 unsigned nRecordHeaderBytes; /* Size of record header data. */
1102 unsigned char *pRecordHeader; /* Pointer to record header data. */
1103 int bFreeRecordHeader; /* True if record header requires free. */
1104 RecoverOverflow *pOverflow; /* Cell overflow info, if needed. */
1105 };
1106
1107 /* Internal helper shared between next-page and create-cursor. If
1108 * pPage is a leaf page, it will be stored in the cursor and state
1109 * initialized for reading cells.
1110 *
1111 * If pPage is an interior page, a new parent cursor is created and
1112 * injected on the stack. This is necessary to handle trees with
1113 * uneven depth, but also is used during initial setup.
1114 *
1115 * If pPage is not a table page at all, it is discarded.
1116 *
1117 * If SQLITE_OK is returned, the caller no longer owns pPage,
1118 * otherwise the caller is responsible for discarding it.
1119 */
leafCursorLoadPage(RecoverLeafCursor * pCursor,DbPage * pPage)1120 static int leafCursorLoadPage(RecoverLeafCursor *pCursor, DbPage *pPage){
1121 const unsigned char *pPageHeader; /* Header of *pPage */
1122
1123 /* Release the current page. */
1124 if( pCursor->pPage ){
1125 sqlite3PagerUnref(pCursor->pPage);
1126 pCursor->pPage = NULL;
1127 pCursor->iCell = pCursor->nCells = 0;
1128 }
1129
1130 /* If the page is an unexpected interior node, inject a new stack
1131 * layer and try again from there.
1132 */
1133 pPageHeader = PageHeader(pPage);
1134 if( pPageHeader[kiPageTypeOffset]==kTableInteriorPage ){
1135 RecoverInteriorCursor *pParent;
1136 int rc = interiorCursorCreate(pCursor->pParent, pPage, pCursor->nPageSize,
1137 &pParent);
1138 if( rc!=SQLITE_OK ){
1139 return rc;
1140 }
1141 pCursor->pParent = pParent;
1142 return SQLITE_OK;
1143 }
1144
1145 /* Not a leaf page, skip it. */
1146 if( pPageHeader[kiPageTypeOffset]!=kTableLeafPage ){
1147 sqlite3PagerUnref(pPage);
1148 return SQLITE_OK;
1149 }
1150
1151 /* Take ownership of the page and start decoding. */
1152 pCursor->pPage = pPage;
1153 pCursor->iCell = 0;
1154 pCursor->nCells = decodeUnsigned16(pPageHeader + kiPageCellCountOffset);
1155 return SQLITE_OK;
1156 }
1157
1158 /* Get the next leaf-level page in the tree. Returns SQLITE_ROW when
1159 * a leaf page is found, SQLITE_DONE when no more leaves exist, or any
1160 * error which occurred.
1161 */
leafCursorNextPage(RecoverLeafCursor * pCursor)1162 static int leafCursorNextPage(RecoverLeafCursor *pCursor){
1163 if( !pCursor->pParent ){
1164 return SQLITE_DONE;
1165 }
1166
1167 /* Repeatedly load the parent's next child page until a leaf is found. */
1168 do {
1169 DbPage *pNextPage;
1170 int rc = interiorCursorNextPage(&pCursor->pParent, &pNextPage);
1171 if( rc!=SQLITE_ROW ){
1172 assert( rc==SQLITE_DONE );
1173 return rc;
1174 }
1175
1176 rc = leafCursorLoadPage(pCursor, pNextPage);
1177 if( rc!=SQLITE_OK ){
1178 sqlite3PagerUnref(pNextPage);
1179 return rc;
1180 }
1181 } while( !pCursor->pPage );
1182
1183 return SQLITE_ROW;
1184 }
1185
leafCursorDestroyCellData(RecoverLeafCursor * pCursor)1186 static void leafCursorDestroyCellData(RecoverLeafCursor *pCursor){
1187 if( pCursor->bFreeRecordHeader ){
1188 sqlite3_free(pCursor->pRecordHeader);
1189 }
1190 pCursor->bFreeRecordHeader = 0;
1191 pCursor->pRecordHeader = NULL;
1192
1193 if( pCursor->pOverflow ){
1194 overflowDestroy(pCursor->pOverflow);
1195 pCursor->pOverflow = NULL;
1196 }
1197 }
1198
leafCursorDestroy(RecoverLeafCursor * pCursor)1199 static void leafCursorDestroy(RecoverLeafCursor *pCursor){
1200 leafCursorDestroyCellData(pCursor);
1201
1202 if( pCursor->pParent ){
1203 interiorCursorDestroy(pCursor->pParent);
1204 pCursor->pParent = NULL;
1205 }
1206
1207 if( pCursor->pPage ){
1208 sqlite3PagerUnref(pCursor->pPage);
1209 pCursor->pPage = NULL;
1210 }
1211
1212 memset(pCursor, 0xA5, sizeof(*pCursor));
1213 sqlite3_free(pCursor);
1214 }
1215
1216 /* Create a cursor to iterate the rows from the leaf pages of a table
1217 * rooted at iRootPage.
1218 */
1219 /* TODO(shess): recoverOpen() calls this to setup the cursor, and I
1220 * think that recoverFilter() may make a hard assumption that the
1221 * cursor returned will turn up at least one valid cell.
1222 *
1223 * The cases I can think of which break this assumption are:
1224 * - pPage is a valid leaf page with no valid cells.
1225 * - pPage is a valid interior page with no valid leaves.
1226 * - pPage is a valid interior page who's leaves contain no valid cells.
1227 * - pPage is not a valid leaf or interior page.
1228 */
leafCursorCreate(Pager * pPager,unsigned nPageSize,u32 iRootPage,RecoverLeafCursor ** ppCursor)1229 static int leafCursorCreate(Pager *pPager, unsigned nPageSize,
1230 u32 iRootPage, RecoverLeafCursor **ppCursor){
1231 DbPage *pPage; /* Reference to page at iRootPage. */
1232 RecoverLeafCursor *pCursor; /* Leaf cursor being constructed. */
1233 int rc;
1234
1235 /* Start out with the root page. */
1236 rc = sqlite3PagerAcquire(pPager, iRootPage, &pPage, 0);
1237 if( rc!=SQLITE_OK ){
1238 return rc;
1239 }
1240
1241 pCursor = sqlite3_malloc(sizeof(RecoverLeafCursor));
1242 if( !pCursor ){
1243 sqlite3PagerUnref(pPage);
1244 return SQLITE_NOMEM;
1245 }
1246 memset(pCursor, 0, sizeof(*pCursor));
1247
1248 pCursor->nPageSize = nPageSize;
1249
1250 rc = leafCursorLoadPage(pCursor, pPage);
1251 if( rc!=SQLITE_OK ){
1252 sqlite3PagerUnref(pPage);
1253 leafCursorDestroy(pCursor);
1254 return rc;
1255 }
1256
1257 /* pPage wasn't a leaf page, find the next leaf page. */
1258 if( !pCursor->pPage ){
1259 rc = leafCursorNextPage(pCursor);
1260 if( rc!=SQLITE_DONE && rc!=SQLITE_ROW ){
1261 leafCursorDestroy(pCursor);
1262 return rc;
1263 }
1264 }
1265
1266 *ppCursor = pCursor;
1267 return SQLITE_OK;
1268 }
1269
1270 /* Useful for setting breakpoints. */
ValidateError()1271 static int ValidateError(){
1272 return SQLITE_ERROR;
1273 }
1274
1275 /* Setup the cursor for reading the information from cell iCell. */
leafCursorCellDecode(RecoverLeafCursor * pCursor)1276 static int leafCursorCellDecode(RecoverLeafCursor *pCursor){
1277 const unsigned char *pPageHeader; /* Header of current page. */
1278 const unsigned char *pPageEnd; /* Byte after end of current page. */
1279 const unsigned char *pCellOffsets; /* Pointer to page's cell offsets. */
1280 unsigned iCellOffset; /* Offset of current cell (iCell). */
1281 const unsigned char *pCell; /* Pointer to data at iCellOffset. */
1282 unsigned nCellMaxBytes; /* Maximum local size of iCell. */
1283 unsigned iEndOffset; /* End of iCell's in-page data. */
1284 u64 nRecordBytes; /* Expected size of cell, w/overflow. */
1285 u64 iRowid; /* iCell's rowid (in table). */
1286 unsigned nRead; /* Amount of cell read. */
1287 unsigned nRecordHeaderRead; /* Header data read. */
1288 u64 nRecordHeaderBytes; /* Header size expected. */
1289 unsigned nRecordCols; /* Columns read from header. */
1290 u64 nRecordColBytes; /* Bytes in payload for those columns. */
1291 unsigned i;
1292 int rc;
1293
1294 assert( pCursor->iCell<pCursor->nCells );
1295
1296 leafCursorDestroyCellData(pCursor);
1297
1298 /* Find the offset to the row. */
1299 pPageHeader = PageHeader(pCursor->pPage);
1300 pCellOffsets = pPageHeader + knPageLeafHeaderBytes;
1301 pPageEnd = PageData(pCursor->pPage, pCursor->nPageSize);
1302 if( pCellOffsets + pCursor->iCell*2 + 2 > pPageEnd ){
1303 return ValidateError();
1304 }
1305 iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iCell*2);
1306 if( iCellOffset>=pCursor->nPageSize ){
1307 return ValidateError();
1308 }
1309
1310 pCell = PageData(pCursor->pPage, iCellOffset);
1311 nCellMaxBytes = pCursor->nPageSize - iCellOffset;
1312
1313 /* B-tree leaf cells lead with varint record size, varint rowid and
1314 * varint header size.
1315 */
1316 /* TODO(shess): The smallest page size is 512 bytes, which has an m
1317 * of 39. Three varints need at most 27 bytes to encode. I think.
1318 */
1319 if( !checkVarints(pCell, nCellMaxBytes, 3) ){
1320 return ValidateError();
1321 }
1322
1323 nRead = getVarint(pCell, &nRecordBytes);
1324 assert( iCellOffset+nRead<=pCursor->nPageSize );
1325 pCursor->nRecordBytes = nRecordBytes;
1326
1327 nRead += getVarint(pCell + nRead, &iRowid);
1328 assert( iCellOffset+nRead<=pCursor->nPageSize );
1329 pCursor->iRowid = (i64)iRowid;
1330
1331 pCursor->iRecordOffset = iCellOffset + nRead;
1332
1333 /* Start overflow setup here because nLocalRecordBytes is needed to
1334 * check cell overlap.
1335 */
1336 rc = overflowMaybeCreate(pCursor->pPage, pCursor->nPageSize,
1337 pCursor->iRecordOffset, pCursor->nRecordBytes,
1338 &pCursor->nLocalRecordBytes,
1339 &pCursor->pOverflow);
1340 if( rc!=SQLITE_OK ){
1341 return ValidateError();
1342 }
1343
1344 /* Check that no other cell starts within this cell. */
1345 iEndOffset = pCursor->iRecordOffset + pCursor->nLocalRecordBytes;
1346 for( i=0; i<pCursor->nCells && pCellOffsets + i*2 + 2 <= pPageEnd; ++i ){
1347 const unsigned iOtherOffset = decodeUnsigned16(pCellOffsets + i*2);
1348 if( iOtherOffset>iCellOffset && iOtherOffset<iEndOffset ){
1349 return ValidateError();
1350 }
1351 }
1352
1353 nRecordHeaderRead = getVarint(pCell + nRead, &nRecordHeaderBytes);
1354 assert( nRecordHeaderBytes<=nRecordBytes );
1355 pCursor->nRecordHeaderBytes = nRecordHeaderBytes;
1356
1357 /* Large headers could overflow if pages are small. */
1358 rc = overflowGetSegment(pCursor->pPage,
1359 pCursor->iRecordOffset, pCursor->nLocalRecordBytes,
1360 pCursor->pOverflow, 0, nRecordHeaderBytes,
1361 &pCursor->pRecordHeader, &pCursor->bFreeRecordHeader);
1362 if( rc!=SQLITE_OK ){
1363 return ValidateError();
1364 }
1365
1366 /* Tally up the column count and size of data. */
1367 nRecordCols = 0;
1368 nRecordColBytes = 0;
1369 while( nRecordHeaderRead<nRecordHeaderBytes ){
1370 u64 iSerialType; /* Type descriptor for current column. */
1371 if( !checkVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1372 nRecordHeaderBytes - nRecordHeaderRead) ){
1373 return ValidateError();
1374 }
1375 nRecordHeaderRead += getVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1376 &iSerialType);
1377 if( iSerialType==10 || iSerialType==11 ){
1378 return ValidateError();
1379 }
1380 nRecordColBytes += SerialTypeLength(iSerialType);
1381 nRecordCols++;
1382 }
1383 pCursor->nRecordCols = nRecordCols;
1384
1385 /* Parsing the header used as many bytes as expected. */
1386 if( nRecordHeaderRead!=nRecordHeaderBytes ){
1387 return ValidateError();
1388 }
1389
1390 /* Calculated record is size of expected record. */
1391 if( nRecordHeaderBytes+nRecordColBytes!=nRecordBytes ){
1392 return ValidateError();
1393 }
1394
1395 return SQLITE_OK;
1396 }
1397
leafCursorCellRowid(RecoverLeafCursor * pCursor)1398 static i64 leafCursorCellRowid(RecoverLeafCursor *pCursor){
1399 return pCursor->iRowid;
1400 }
1401
leafCursorCellColumns(RecoverLeafCursor * pCursor)1402 static unsigned leafCursorCellColumns(RecoverLeafCursor *pCursor){
1403 return pCursor->nRecordCols;
1404 }
1405
1406 /* Get the column info for the cell. Pass NULL for ppBase to prevent
1407 * retrieving the data segment. If *pbFree is true, *ppBase must be
1408 * freed by the caller using sqlite3_free().
1409 */
leafCursorCellColInfo(RecoverLeafCursor * pCursor,unsigned iCol,u64 * piColType,unsigned char ** ppBase,int * pbFree)1410 static int leafCursorCellColInfo(RecoverLeafCursor *pCursor,
1411 unsigned iCol, u64 *piColType,
1412 unsigned char **ppBase, int *pbFree){
1413 const unsigned char *pRecordHeader; /* Current cell's header. */
1414 u64 nRecordHeaderBytes; /* Bytes in pRecordHeader. */
1415 unsigned nRead; /* Bytes read from header. */
1416 u64 iColEndOffset; /* Offset to end of column in cell. */
1417 unsigned nColsSkipped; /* Count columns as procesed. */
1418 u64 iSerialType; /* Type descriptor for current column. */
1419
1420 /* Implicit NULL for columns past the end. This case happens when
1421 * rows have not been updated since an ALTER TABLE added columns.
1422 * It is more convenient to address here than in callers.
1423 */
1424 if( iCol>=pCursor->nRecordCols ){
1425 *piColType = 0;
1426 if( ppBase ){
1427 *ppBase = 0;
1428 *pbFree = 0;
1429 }
1430 return SQLITE_OK;
1431 }
1432
1433 /* Must be able to decode header size. */
1434 pRecordHeader = pCursor->pRecordHeader;
1435 if( !checkVarint(pRecordHeader, pCursor->nRecordHeaderBytes) ){
1436 return SQLITE_CORRUPT;
1437 }
1438
1439 /* Rather than caching the header size and how many bytes it took,
1440 * decode it every time.
1441 */
1442 nRead = getVarint(pRecordHeader, &nRecordHeaderBytes);
1443 assert( nRecordHeaderBytes==pCursor->nRecordHeaderBytes );
1444
1445 /* Scan forward to the indicated column. Scans to _after_ column
1446 * for later range checking.
1447 */
1448 /* TODO(shess): This could get expensive for very wide tables. An
1449 * array of iSerialType could be built in leafCursorCellDecode(), but
1450 * the number of columns is dynamic per row, so it would add memory
1451 * management complexity. Enough info to efficiently forward
1452 * iterate could be kept, if all clients forward iterate
1453 * (recoverColumn() may not).
1454 */
1455 iColEndOffset = 0;
1456 nColsSkipped = 0;
1457 while( nColsSkipped<=iCol && nRead<nRecordHeaderBytes ){
1458 if( !checkVarint(pRecordHeader + nRead, nRecordHeaderBytes - nRead) ){
1459 return SQLITE_CORRUPT;
1460 }
1461 nRead += getVarint(pRecordHeader + nRead, &iSerialType);
1462 iColEndOffset += SerialTypeLength(iSerialType);
1463 nColsSkipped++;
1464 }
1465
1466 /* Column's data extends past record's end. */
1467 if( nRecordHeaderBytes+iColEndOffset>pCursor->nRecordBytes ){
1468 return SQLITE_CORRUPT;
1469 }
1470
1471 *piColType = iSerialType;
1472 if( ppBase ){
1473 const u32 nColBytes = SerialTypeLength(iSerialType);
1474
1475 /* Offset from start of record to beginning of column. */
1476 const unsigned iColOffset = nRecordHeaderBytes+iColEndOffset-nColBytes;
1477
1478 return overflowGetSegment(pCursor->pPage, pCursor->iRecordOffset,
1479 pCursor->nLocalRecordBytes, pCursor->pOverflow,
1480 iColOffset, nColBytes, ppBase, pbFree);
1481 }
1482 return SQLITE_OK;
1483 }
1484
leafCursorNextValidCell(RecoverLeafCursor * pCursor)1485 static int leafCursorNextValidCell(RecoverLeafCursor *pCursor){
1486 while( 1 ){
1487 int rc;
1488
1489 /* Move to the next cell. */
1490 pCursor->iCell++;
1491
1492 /* No more cells, get the next leaf. */
1493 if( pCursor->iCell>=pCursor->nCells ){
1494 rc = leafCursorNextPage(pCursor);
1495 if( rc!=SQLITE_ROW ){
1496 return rc;
1497 }
1498 assert( pCursor->iCell==0 );
1499 }
1500
1501 /* If the cell is valid, indicate that a row is available. */
1502 rc = leafCursorCellDecode(pCursor);
1503 if( rc==SQLITE_OK ){
1504 return SQLITE_ROW;
1505 }
1506
1507 /* Iterate until done or a valid row is found. */
1508 /* TODO(shess): Remove debugging output. */
1509 fprintf(stderr, "Skipping invalid cell\n");
1510 }
1511 return SQLITE_ERROR;
1512 }
1513
1514 typedef struct Recover Recover;
1515 struct Recover {
1516 sqlite3_vtab base;
1517 sqlite3 *db; /* Host database connection */
1518 char *zDb; /* Database containing target table */
1519 char *zTable; /* Target table */
1520 unsigned nCols; /* Number of columns in target table */
1521 unsigned char *pTypes; /* Types of columns in target table */
1522 };
1523
1524 /* Internal helper for deleting the module. */
recoverRelease(Recover * pRecover)1525 static void recoverRelease(Recover *pRecover){
1526 sqlite3_free(pRecover->zDb);
1527 sqlite3_free(pRecover->zTable);
1528 sqlite3_free(pRecover->pTypes);
1529 memset(pRecover, 0xA5, sizeof(*pRecover));
1530 sqlite3_free(pRecover);
1531 }
1532
1533 /* Helper function for initializing the module. Forward-declared so
1534 * recoverCreate() and recoverConnect() can see it.
1535 */
1536 static int recoverInit(
1537 sqlite3 *, void *, int, const char *const*, sqlite3_vtab **, char **
1538 );
1539
recoverCreate(sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVtab,char ** pzErr)1540 static int recoverCreate(
1541 sqlite3 *db,
1542 void *pAux,
1543 int argc, const char *const*argv,
1544 sqlite3_vtab **ppVtab,
1545 char **pzErr
1546 ){
1547 FNENTRY();
1548 return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1549 }
1550
1551 /* This should never be called. */
recoverConnect(sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVtab,char ** pzErr)1552 static int recoverConnect(
1553 sqlite3 *db,
1554 void *pAux,
1555 int argc, const char *const*argv,
1556 sqlite3_vtab **ppVtab,
1557 char **pzErr
1558 ){
1559 FNENTRY();
1560 return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1561 }
1562
1563 /* No indices supported. */
recoverBestIndex(sqlite3_vtab * tab,sqlite3_index_info * pIdxInfo)1564 static int recoverBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
1565 FNENTRY();
1566 return SQLITE_OK;
1567 }
1568
1569 /* Logically, this should never be called. */
recoverDisconnect(sqlite3_vtab * pVtab)1570 static int recoverDisconnect(sqlite3_vtab *pVtab){
1571 FNENTRY();
1572 recoverRelease((Recover*)pVtab);
1573 return SQLITE_OK;
1574 }
1575
recoverDestroy(sqlite3_vtab * pVtab)1576 static int recoverDestroy(sqlite3_vtab *pVtab){
1577 FNENTRY();
1578 recoverRelease((Recover*)pVtab);
1579 return SQLITE_OK;
1580 }
1581
1582 typedef struct RecoverCursor RecoverCursor;
1583 struct RecoverCursor {
1584 sqlite3_vtab_cursor base;
1585 RecoverLeafCursor *pLeafCursor;
1586 int iEncoding;
1587 int bEOF;
1588 };
1589
recoverOpen(sqlite3_vtab * pVTab,sqlite3_vtab_cursor ** ppCursor)1590 static int recoverOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
1591 Recover *pRecover = (Recover*)pVTab;
1592 u32 iRootPage; /* Root page of the backing table. */
1593 int iEncoding; /* UTF encoding for backing database. */
1594 unsigned nPageSize; /* Size of pages in backing database. */
1595 Pager *pPager; /* Backing database pager. */
1596 RecoverLeafCursor *pLeafCursor; /* Cursor to read table's leaf pages. */
1597 RecoverCursor *pCursor; /* Cursor to read rows from leaves. */
1598 int rc;
1599
1600 FNENTRY();
1601
1602 iRootPage = 0;
1603 rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable,
1604 &iRootPage);
1605 if( rc!=SQLITE_OK ){
1606 return rc;
1607 }
1608
1609 iEncoding = 0;
1610 rc = getEncoding(pRecover->db, pRecover->zDb, &iEncoding);
1611 if( rc!=SQLITE_OK ){
1612 return rc;
1613 }
1614
1615 rc = GetPager(pRecover->db, pRecover->zDb, &pPager, &nPageSize);
1616 if( rc!=SQLITE_OK ){
1617 return rc;
1618 }
1619
1620 rc = leafCursorCreate(pPager, nPageSize, iRootPage, &pLeafCursor);
1621 if( rc!=SQLITE_OK ){
1622 return rc;
1623 }
1624
1625 pCursor = sqlite3_malloc(sizeof(RecoverCursor));
1626 if( !pCursor ){
1627 leafCursorDestroy(pLeafCursor);
1628 return SQLITE_NOMEM;
1629 }
1630 memset(pCursor, 0, sizeof(*pCursor));
1631 pCursor->base.pVtab = pVTab;
1632 pCursor->pLeafCursor = pLeafCursor;
1633 pCursor->iEncoding = iEncoding;
1634
1635 /* If no leaf pages were found, empty result set. */
1636 /* TODO(shess): leafCursorNextValidCell() would return SQLITE_ROW or
1637 * SQLITE_DONE to indicate whether there is further data to consider.
1638 */
1639 pCursor->bEOF = (pLeafCursor->pPage==NULL);
1640
1641 *ppCursor = (sqlite3_vtab_cursor*)pCursor;
1642 return SQLITE_OK;
1643 }
1644
recoverClose(sqlite3_vtab_cursor * cur)1645 static int recoverClose(sqlite3_vtab_cursor *cur){
1646 RecoverCursor *pCursor = (RecoverCursor*)cur;
1647 FNENTRY();
1648 if( pCursor->pLeafCursor ){
1649 leafCursorDestroy(pCursor->pLeafCursor);
1650 pCursor->pLeafCursor = NULL;
1651 }
1652 memset(pCursor, 0xA5, sizeof(*pCursor));
1653 sqlite3_free(cur);
1654 return SQLITE_OK;
1655 }
1656
1657 /* Helpful place to set a breakpoint. */
RecoverInvalidCell()1658 static int RecoverInvalidCell(){
1659 return SQLITE_ERROR;
1660 }
1661
1662 /* Returns SQLITE_OK if the cell has an appropriate number of columns
1663 * with the appropriate types of data.
1664 */
recoverValidateLeafCell(Recover * pRecover,RecoverCursor * pCursor)1665 static int recoverValidateLeafCell(Recover *pRecover, RecoverCursor *pCursor){
1666 unsigned i;
1667
1668 /* If the row's storage has too many columns, skip it. */
1669 if( leafCursorCellColumns(pCursor->pLeafCursor)>pRecover->nCols ){
1670 return RecoverInvalidCell();
1671 }
1672
1673 /* Skip rows with unexpected types. */
1674 for( i=0; i<pRecover->nCols; ++i ){
1675 u64 iType; /* Storage type of column i. */
1676 int rc;
1677
1678 /* ROWID alias. */
1679 if( (pRecover->pTypes[i]&MASK_ROWID) ){
1680 continue;
1681 }
1682
1683 rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iType, NULL, NULL);
1684 assert( rc==SQLITE_OK );
1685 if( rc!=SQLITE_OK || !SerialTypeIsCompatible(iType, pRecover->pTypes[i]) ){
1686 return RecoverInvalidCell();
1687 }
1688 }
1689
1690 return SQLITE_OK;
1691 }
1692
recoverNext(sqlite3_vtab_cursor * pVtabCursor)1693 static int recoverNext(sqlite3_vtab_cursor *pVtabCursor){
1694 RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1695 Recover *pRecover = (Recover*)pCursor->base.pVtab;
1696 int rc;
1697
1698 FNENTRY();
1699
1700 /* Scan forward to the next cell with valid storage, then check that
1701 * the stored data matches the schema.
1702 */
1703 while( (rc = leafCursorNextValidCell(pCursor->pLeafCursor))==SQLITE_ROW ){
1704 if( recoverValidateLeafCell(pRecover, pCursor)==SQLITE_OK ){
1705 return SQLITE_OK;
1706 }
1707 }
1708
1709 if( rc==SQLITE_DONE ){
1710 pCursor->bEOF = 1;
1711 return SQLITE_OK;
1712 }
1713
1714 assert( rc!=SQLITE_OK );
1715 return rc;
1716 }
1717
recoverFilter(sqlite3_vtab_cursor * pVtabCursor,int idxNum,const char * idxStr,int argc,sqlite3_value ** argv)1718 static int recoverFilter(
1719 sqlite3_vtab_cursor *pVtabCursor,
1720 int idxNum, const char *idxStr,
1721 int argc, sqlite3_value **argv
1722 ){
1723 RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1724 Recover *pRecover = (Recover*)pCursor->base.pVtab;
1725 int rc;
1726
1727 FNENTRY();
1728
1729 /* There were no valid leaf pages in the table. */
1730 if( pCursor->bEOF ){
1731 return SQLITE_OK;
1732 }
1733
1734 /* Load the first cell, and iterate forward if it's not valid. If no cells at
1735 * all are valid, recoverNext() sets bEOF and returns appropriately.
1736 */
1737 rc = leafCursorCellDecode(pCursor->pLeafCursor);
1738 if( rc!=SQLITE_OK || recoverValidateLeafCell(pRecover, pCursor)!=SQLITE_OK ){
1739 return recoverNext(pVtabCursor);
1740 }
1741
1742 return SQLITE_OK;
1743 }
1744
recoverEof(sqlite3_vtab_cursor * pVtabCursor)1745 static int recoverEof(sqlite3_vtab_cursor *pVtabCursor){
1746 RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1747 FNENTRY();
1748 return pCursor->bEOF;
1749 }
1750
recoverColumn(sqlite3_vtab_cursor * cur,sqlite3_context * ctx,int i)1751 static int recoverColumn(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i){
1752 RecoverCursor *pCursor = (RecoverCursor*)cur;
1753 Recover *pRecover = (Recover*)pCursor->base.pVtab;
1754 u64 iColType; /* Storage type of column i. */
1755 unsigned char *pColData; /* Column i's data. */
1756 int shouldFree; /* Non-zero if pColData should be freed. */
1757 int rc;
1758
1759 FNENTRY();
1760
1761 if( i>=pRecover->nCols ){
1762 return SQLITE_ERROR;
1763 }
1764
1765 /* ROWID alias. */
1766 if( (pRecover->pTypes[i]&MASK_ROWID) ){
1767 sqlite3_result_int64(ctx, leafCursorCellRowid(pCursor->pLeafCursor));
1768 return SQLITE_OK;
1769 }
1770
1771 pColData = NULL;
1772 shouldFree = 0;
1773 rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iColType,
1774 &pColData, &shouldFree);
1775 if( rc!=SQLITE_OK ){
1776 return rc;
1777 }
1778 /* recoverValidateLeafCell() should guarantee that this will never
1779 * occur.
1780 */
1781 if( !SerialTypeIsCompatible(iColType, pRecover->pTypes[i]) ){
1782 if( shouldFree ){
1783 sqlite3_free(pColData);
1784 }
1785 return SQLITE_ERROR;
1786 }
1787
1788 switch( iColType ){
1789 case 0 : sqlite3_result_null(ctx); break;
1790 case 1 : sqlite3_result_int64(ctx, decodeSigned(pColData, 1)); break;
1791 case 2 : sqlite3_result_int64(ctx, decodeSigned(pColData, 2)); break;
1792 case 3 : sqlite3_result_int64(ctx, decodeSigned(pColData, 3)); break;
1793 case 4 : sqlite3_result_int64(ctx, decodeSigned(pColData, 4)); break;
1794 case 5 : sqlite3_result_int64(ctx, decodeSigned(pColData, 6)); break;
1795 case 6 : sqlite3_result_int64(ctx, decodeSigned(pColData, 8)); break;
1796 case 7 : sqlite3_result_double(ctx, decodeFloat64(pColData)); break;
1797 case 8 : sqlite3_result_int(ctx, 0); break;
1798 case 9 : sqlite3_result_int(ctx, 1); break;
1799 case 10 : assert( iColType!=10 ); break;
1800 case 11 : assert( iColType!=11 ); break;
1801
1802 default : {
1803 u32 l = SerialTypeLength(iColType);
1804
1805 /* If pColData was already allocated, arrange to pass ownership. */
1806 sqlite3_destructor_type pFn = SQLITE_TRANSIENT;
1807 if( shouldFree ){
1808 pFn = sqlite3_free;
1809 shouldFree = 0;
1810 }
1811
1812 if( SerialTypeIsBlob(iColType) ){
1813 sqlite3_result_blob(ctx, pColData, l, pFn);
1814 }else{
1815 if( pCursor->iEncoding==SQLITE_UTF16LE ){
1816 sqlite3_result_text16le(ctx, (const void*)pColData, l, pFn);
1817 }else if( pCursor->iEncoding==SQLITE_UTF16BE ){
1818 sqlite3_result_text16be(ctx, (const void*)pColData, l, pFn);
1819 }else{
1820 sqlite3_result_text(ctx, (const char*)pColData, l, pFn);
1821 }
1822 }
1823 } break;
1824 }
1825 if( shouldFree ){
1826 sqlite3_free(pColData);
1827 }
1828 return SQLITE_OK;
1829 }
1830
recoverRowid(sqlite3_vtab_cursor * pVtabCursor,sqlite_int64 * pRowid)1831 static int recoverRowid(sqlite3_vtab_cursor *pVtabCursor, sqlite_int64 *pRowid){
1832 RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1833 FNENTRY();
1834 *pRowid = leafCursorCellRowid(pCursor->pLeafCursor);
1835 return SQLITE_OK;
1836 }
1837
1838 static sqlite3_module recoverModule = {
1839 0, /* iVersion */
1840 recoverCreate, /* xCreate - create a table */
1841 recoverConnect, /* xConnect - connect to an existing table */
1842 recoverBestIndex, /* xBestIndex - Determine search strategy */
1843 recoverDisconnect, /* xDisconnect - Disconnect from a table */
1844 recoverDestroy, /* xDestroy - Drop a table */
1845 recoverOpen, /* xOpen - open a cursor */
1846 recoverClose, /* xClose - close a cursor */
1847 recoverFilter, /* xFilter - configure scan constraints */
1848 recoverNext, /* xNext - advance a cursor */
1849 recoverEof, /* xEof */
1850 recoverColumn, /* xColumn - read data */
1851 recoverRowid, /* xRowid - read data */
1852 0, /* xUpdate - write data */
1853 0, /* xBegin - begin transaction */
1854 0, /* xSync - sync transaction */
1855 0, /* xCommit - commit transaction */
1856 0, /* xRollback - rollback transaction */
1857 0, /* xFindFunction - function overloading */
1858 0, /* xRename - rename the table */
1859 };
1860
recoverVtableInit(sqlite3 * db)1861 int recoverVtableInit(sqlite3 *db){
1862 return sqlite3_create_module_v2(db, "recover", &recoverModule, NULL, 0);
1863 }
1864
1865 /* This section of code is for parsing the create input and
1866 * initializing the module.
1867 */
1868
1869 /* Find the next word in zText and place the endpoints in pzWord*.
1870 * Returns true if the word is non-empty. "Word" is defined as
1871 * ASCII alphanumeric plus '_' at this time.
1872 */
findWord(const char * zText,const char ** pzWordStart,const char ** pzWordEnd)1873 static int findWord(const char *zText,
1874 const char **pzWordStart, const char **pzWordEnd){
1875 int r;
1876 while( ascii_isspace(*zText) ){
1877 zText++;
1878 }
1879 *pzWordStart = zText;
1880 while( ascii_isalnum(*zText) || *zText=='_' ){
1881 zText++;
1882 }
1883 r = zText>*pzWordStart; /* In case pzWordStart==pzWordEnd */
1884 *pzWordEnd = zText;
1885 return r;
1886 }
1887
1888 /* Return true if the next word in zText is zWord, also setting
1889 * *pzContinue to the character after the word.
1890 */
expectWord(const char * zText,const char * zWord,const char ** pzContinue)1891 static int expectWord(const char *zText, const char *zWord,
1892 const char **pzContinue){
1893 const char *zWordStart, *zWordEnd;
1894 if( findWord(zText, &zWordStart, &zWordEnd) &&
1895 ascii_strncasecmp(zWord, zWordStart, zWordEnd - zWordStart)==0 ){
1896 *pzContinue = zWordEnd;
1897 return 1;
1898 }
1899 return 0;
1900 }
1901
1902 /* Parse the name and type information out of parameter. In case of
1903 * success, *pzNameStart/End contain the name of the column,
1904 * *pzTypeStart/End contain the top-level type, and *pTypeMask has the
1905 * type mask to use for the column.
1906 */
findNameAndType(const char * parameter,const char ** pzNameStart,const char ** pzNameEnd,const char ** pzTypeStart,const char ** pzTypeEnd,unsigned char * pTypeMask)1907 static int findNameAndType(const char *parameter,
1908 const char **pzNameStart, const char **pzNameEnd,
1909 const char **pzTypeStart, const char **pzTypeEnd,
1910 unsigned char *pTypeMask){
1911 unsigned nNameLen; /* Length of found name. */
1912 const char *zEnd; /* Current end of parsed column information. */
1913 int bNotNull; /* Non-zero if NULL is not allowed for name. */
1914 int bStrict; /* Non-zero if column requires exact type match. */
1915 const char *zDummy; /* Dummy parameter, result unused. */
1916 unsigned i;
1917
1918 /* strictMask is used for STRICT, strictMask|otherMask if STRICT is
1919 * not supplied. zReplace provides an alternate type to expose to
1920 * the caller.
1921 */
1922 static struct {
1923 const char *zName;
1924 unsigned char strictMask;
1925 unsigned char otherMask;
1926 const char *zReplace;
1927 } kTypeInfo[] = {
1928 { "ANY",
1929 MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL,
1930 0, "",
1931 },
1932 { "ROWID", MASK_INTEGER | MASK_ROWID, 0, "INTEGER", },
1933 { "INTEGER", MASK_INTEGER | MASK_NULL, 0, NULL, },
1934 { "FLOAT", MASK_FLOAT | MASK_NULL, MASK_INTEGER, NULL, },
1935 { "NUMERIC", MASK_INTEGER | MASK_FLOAT | MASK_NULL, MASK_TEXT, NULL, },
1936 { "TEXT", MASK_TEXT | MASK_NULL, MASK_BLOB, NULL, },
1937 { "BLOB", MASK_BLOB | MASK_NULL, 0, NULL, },
1938 };
1939
1940 if( !findWord(parameter, pzNameStart, pzNameEnd) ){
1941 return SQLITE_MISUSE;
1942 }
1943
1944 /* Manifest typing, accept any storage type. */
1945 if( !findWord(*pzNameEnd, pzTypeStart, pzTypeEnd) ){
1946 *pzTypeEnd = *pzTypeStart = "";
1947 *pTypeMask = MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL;
1948 return SQLITE_OK;
1949 }
1950
1951 nNameLen = *pzTypeEnd - *pzTypeStart;
1952 for( i=0; i<ArraySize(kTypeInfo); ++i ){
1953 if( ascii_strncasecmp(kTypeInfo[i].zName, *pzTypeStart, nNameLen)==0 ){
1954 break;
1955 }
1956 }
1957 if( i==ArraySize(kTypeInfo) ){
1958 return SQLITE_MISUSE;
1959 }
1960
1961 zEnd = *pzTypeEnd;
1962 bStrict = 0;
1963 if( expectWord(zEnd, "STRICT", &zEnd) ){
1964 /* TODO(shess): Ick. But I don't want another single-purpose
1965 * flag, either.
1966 */
1967 if( kTypeInfo[i].zReplace && !kTypeInfo[i].zReplace[0] ){
1968 return SQLITE_MISUSE;
1969 }
1970 bStrict = 1;
1971 }
1972
1973 bNotNull = 0;
1974 if( expectWord(zEnd, "NOT", &zEnd) ){
1975 if( expectWord(zEnd, "NULL", &zEnd) ){
1976 bNotNull = 1;
1977 }else{
1978 /* Anything other than NULL after NOT is an error. */
1979 return SQLITE_MISUSE;
1980 }
1981 }
1982
1983 /* Anything else is an error. */
1984 if( findWord(zEnd, &zDummy, &zDummy) ){
1985 return SQLITE_MISUSE;
1986 }
1987
1988 *pTypeMask = kTypeInfo[i].strictMask;
1989 if( !bStrict ){
1990 *pTypeMask |= kTypeInfo[i].otherMask;
1991 }
1992 if( bNotNull ){
1993 *pTypeMask &= ~MASK_NULL;
1994 }
1995 if( kTypeInfo[i].zReplace ){
1996 *pzTypeStart = kTypeInfo[i].zReplace;
1997 *pzTypeEnd = *pzTypeStart + strlen(*pzTypeStart);
1998 }
1999 return SQLITE_OK;
2000 }
2001
2002 /* Parse the arguments, placing type masks in *pTypes and the exposed
2003 * schema in *pzCreateSql (for sqlite3_declare_vtab).
2004 */
ParseColumnsAndGenerateCreate(unsigned nCols,const char * const * pCols,char ** pzCreateSql,unsigned char * pTypes,char ** pzErr)2005 static int ParseColumnsAndGenerateCreate(unsigned nCols,
2006 const char *const *pCols,
2007 char **pzCreateSql,
2008 unsigned char *pTypes,
2009 char **pzErr){
2010 unsigned i;
2011 char *zCreateSql = sqlite3_mprintf("CREATE TABLE x(");
2012 if( !zCreateSql ){
2013 return SQLITE_NOMEM;
2014 }
2015
2016 for( i=0; i<nCols; i++ ){
2017 const char *zSep = (i < nCols - 1 ? ", " : ")");
2018 const char *zNotNull = "";
2019 const char *zNameStart, *zNameEnd;
2020 const char *zTypeStart, *zTypeEnd;
2021 int rc = findNameAndType(pCols[i],
2022 &zNameStart, &zNameEnd,
2023 &zTypeStart, &zTypeEnd,
2024 &pTypes[i]);
2025 if( rc!=SQLITE_OK ){
2026 *pzErr = sqlite3_mprintf("unable to parse column %d", i);
2027 sqlite3_free(zCreateSql);
2028 return rc;
2029 }
2030
2031 if( !(pTypes[i]&MASK_NULL) ){
2032 zNotNull = " NOT NULL";
2033 }
2034
2035 /* Add name and type to the create statement. */
2036 zCreateSql = sqlite3_mprintf("%z%.*s %.*s%s%s",
2037 zCreateSql,
2038 zNameEnd - zNameStart, zNameStart,
2039 zTypeEnd - zTypeStart, zTypeStart,
2040 zNotNull, zSep);
2041 if( !zCreateSql ){
2042 return SQLITE_NOMEM;
2043 }
2044 }
2045
2046 *pzCreateSql = zCreateSql;
2047 return SQLITE_OK;
2048 }
2049
2050 /* Helper function for initializing the module. */
2051 /* argv[0] module name
2052 * argv[1] db name for virtual table
2053 * argv[2] virtual table name
2054 * argv[3] backing table name
2055 * argv[4] columns
2056 */
2057 /* TODO(shess): Since connect isn't supported, could inline into
2058 * recoverCreate().
2059 */
2060 /* TODO(shess): Explore cases where it would make sense to set *pzErr. */
recoverInit(sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVtab,char ** pzErr)2061 static int recoverInit(
2062 sqlite3 *db, /* Database connection */
2063 void *pAux, /* unused */
2064 int argc, const char *const*argv, /* Parameters to CREATE TABLE statement */
2065 sqlite3_vtab **ppVtab, /* OUT: New virtual table */
2066 char **pzErr /* OUT: Error message, if any */
2067 ){
2068 const unsigned kTypeCol = 4; /* First argument with column type info. */
2069 Recover *pRecover; /* Virtual table structure being created. */
2070 char *zDot; /* Any dot found in "db.table" backing. */
2071 u32 iRootPage; /* Root page of backing table. */
2072 char *zCreateSql; /* Schema of created virtual table. */
2073 int rc;
2074
2075 /* Require to be in the temp database. */
2076 if( ascii_strcasecmp(argv[1], "temp")!=0 ){
2077 *pzErr = sqlite3_mprintf("recover table must be in temp database");
2078 return SQLITE_MISUSE;
2079 }
2080
2081 /* Need the backing table and at least one column. */
2082 if( argc<=kTypeCol ){
2083 *pzErr = sqlite3_mprintf("no columns specified");
2084 return SQLITE_MISUSE;
2085 }
2086
2087 pRecover = sqlite3_malloc(sizeof(Recover));
2088 if( !pRecover ){
2089 return SQLITE_NOMEM;
2090 }
2091 memset(pRecover, 0, sizeof(*pRecover));
2092 pRecover->base.pModule = &recoverModule;
2093 pRecover->db = db;
2094
2095 /* Parse out db.table, assuming main if no dot. */
2096 zDot = strchr(argv[3], '.');
2097 if( !zDot ){
2098 pRecover->zDb = sqlite3_strdup(db->aDb[0].zName);
2099 pRecover->zTable = sqlite3_strdup(argv[3]);
2100 }else if( zDot>argv[3] && zDot[1]!='\0' ){
2101 pRecover->zDb = sqlite3_strndup(argv[3], zDot - argv[3]);
2102 pRecover->zTable = sqlite3_strdup(zDot + 1);
2103 }else{
2104 /* ".table" or "db." not allowed. */
2105 *pzErr = sqlite3_mprintf("ill-formed table specifier");
2106 recoverRelease(pRecover);
2107 return SQLITE_ERROR;
2108 }
2109
2110 pRecover->nCols = argc - kTypeCol;
2111 pRecover->pTypes = sqlite3_malloc(pRecover->nCols);
2112 if( !pRecover->zDb || !pRecover->zTable || !pRecover->pTypes ){
2113 recoverRelease(pRecover);
2114 return SQLITE_NOMEM;
2115 }
2116
2117 /* Require the backing table to exist. */
2118 /* TODO(shess): Be more pedantic about the form of the descriptor
2119 * string. This already fails for poorly-formed strings, simply
2120 * because there won't be a root page, but it would make more sense
2121 * to be explicit.
2122 */
2123 rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable, &iRootPage);
2124 if( rc!=SQLITE_OK ){
2125 *pzErr = sqlite3_mprintf("unable to find backing table");
2126 recoverRelease(pRecover);
2127 return rc;
2128 }
2129
2130 /* Parse the column definitions. */
2131 rc = ParseColumnsAndGenerateCreate(pRecover->nCols, argv + kTypeCol,
2132 &zCreateSql, pRecover->pTypes, pzErr);
2133 if( rc!=SQLITE_OK ){
2134 recoverRelease(pRecover);
2135 return rc;
2136 }
2137
2138 rc = sqlite3_declare_vtab(db, zCreateSql);
2139 sqlite3_free(zCreateSql);
2140 if( rc!=SQLITE_OK ){
2141 recoverRelease(pRecover);
2142 return rc;
2143 }
2144
2145 *ppVtab = (sqlite3_vtab *)pRecover;
2146 return SQLITE_OK;
2147 }
2148