• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1Add new virtual table 'recover' to src/ and the amalgamation.
2
3Since recover.c is in somewhat active development, it is possible that
4the patch below will not reliably re-create the file.
5
6shess@chromium.org
7
8Generated with:
9git diff --cached --relative=third_party/sqlite/src --src-prefix='' --dst-prefix='' > third_party/sqlite/recover.patch
10[--cached because otherwise the diff adding recover.c wasn't generated.]
11
12diff --git Makefile.in Makefile.in
13index f3239f3..216742c 100644
14--- Makefile.in
15+++ Makefile.in
16@@ -251,6 +251,7 @@ SRC = \
17   $(TOP)/src/prepare.c \
18   $(TOP)/src/printf.c \
19   $(TOP)/src/random.c \
20+  $(TOP)/src/recover.c \
21   $(TOP)/src/resolve.c \
22   $(TOP)/src/rowset.c \
23   $(TOP)/src/select.c \
24diff --git src/sqlite.h.in src/sqlite.h.in
25index 62b9326..fb76659 100644
26--- src/sqlite.h.in
27+++ src/sqlite.h.in
28@@ -6403,6 +6403,17 @@ int sqlite3_wal_checkpoint_v2(
29 #define SQLITE_CHECKPOINT_RESTART 2
30
31
32+/* Begin recover.patch for Chromium */
33+/*
34+** Call to initialize the recover virtual-table modules (see recover.c).
35+**
36+** This could be loaded by default in main.c, but that would make the
37+** virtual table available to Web SQL.  Breaking it out allows only
38+** selected users to enable it (currently sql/recovery.cc).
39+*/
40+int recoverVtableInit(sqlite3 *db);
41+/* End recover.patch for Chromium */
42+
43 /*
44 ** Undo the hack that converts floating point types to integer for
45 ** builds on processors without floating point support.
46diff --git tool/mksqlite3c.tcl tool/mksqlite3c.tcl
47index fa99f2d..df2df07 100644
48--- tool/mksqlite3c.tcl
49+++ tool/mksqlite3c.tcl
50@@ -293,6 +293,8 @@ foreach file {
51    main.c
52    notify.c
53
54+   recover.c
55+
56    fts3.c
57    fts3_aux.c
58    fts3_expr.c
59diff --git src/recover.c src/recover.c
60new file mode 100644
61index 0000000..6430c8b
62--- /dev/null
63+++ src/recover.c
64@@ -0,0 +1,2130 @@
65+/*
66+** 2012 Jan 11
67+**
68+** The author disclaims copyright to this source code.  In place of
69+** a legal notice, here is a blessing:
70+**
71+**    May you do good and not evil.
72+**    May you find forgiveness for yourself and forgive others.
73+**    May you share freely, never taking more than you give.
74+*/
75+/* TODO(shess): THIS MODULE IS STILL EXPERIMENTAL.  DO NOT USE IT. */
76+/* Implements a virtual table "recover" which can be used to recover
77+ * data from a corrupt table.  The table is walked manually, with
78+ * corrupt items skipped.  Additionally, any errors while reading will
79+ * be skipped.
80+ *
81+ * Given a table with this definition:
82+ *
83+ * CREATE TABLE Stuff (
84+ *   name TEXT PRIMARY KEY,
85+ *   value TEXT NOT NULL
86+ * );
87+ *
88+ * to recover the data from the table, you could do something like:
89+ *
90+ * -- Attach another database, the original is not trustworthy.
91+ * ATTACH DATABASE '/tmp/db.db' AS rdb;
92+ * -- Create a new version of the table.
93+ * CREATE TABLE rdb.Stuff (
94+ *   name TEXT PRIMARY KEY,
95+ *   value TEXT NOT NULL
96+ * );
97+ * -- This will read the original table's data.
98+ * CREATE VIRTUAL TABLE temp.recover_Stuff using recover(
99+ *   main.Stuff,
100+ *   name TEXT STRICT NOT NULL,  -- only real TEXT data allowed
101+ *   value TEXT STRICT NOT NULL
102+ * );
103+ * -- Corruption means the UNIQUE constraint may no longer hold for
104+ * -- Stuff, so either OR REPLACE or OR IGNORE must be used.
105+ * INSERT OR REPLACE INTO rdb.Stuff (rowid, name, value )
106+ *   SELECT rowid, name, value FROM temp.recover_Stuff;
107+ * DROP TABLE temp.recover_Stuff;
108+ * DETACH DATABASE rdb;
109+ * -- Move db.db to replace original db in filesystem.
110+ *
111+ *
112+ * Usage
113+ *
114+ * Given the goal of dealing with corruption, it would not be safe to
115+ * create a recovery table in the database being recovered.  So
116+ * recovery tables must be created in the temp database.  They are not
117+ * appropriate to persist, in any case.  [As a bonus, sqlite_master
118+ * tables can be recovered.  Perhaps more cute than useful, though.]
119+ *
120+ * The parameters are a specifier for the table to read, and a column
121+ * definition for each bit of data stored in that table.  The named
122+ * table must be convertible to a root page number by reading the
123+ * sqlite_master table.  Bare table names are assumed to be in
124+ * database 0 ("main"), other databases can be specified in db.table
125+ * fashion.
126+ *
127+ * Column definitions are similar to BUT NOT THE SAME AS those
128+ * provided to CREATE statements:
129+ *  column-def: column-name [type-name [STRICT] [NOT NULL]]
130+ *  type-name: (ANY|ROWID|INTEGER|FLOAT|NUMERIC|TEXT|BLOB)
131+ *
132+ * Only those exact type names are accepted, there is no type
133+ * intuition.  The only constraints accepted are STRICT (see below)
134+ * and NOT NULL.  Anything unexpected will cause the create to fail.
135+ *
136+ * ANY is a convenience to indicate that manifest typing is desired.
137+ * It is equivalent to not specifying a type at all.  The results for
138+ * such columns will have the type of the data's storage.  The exposed
139+ * schema will contain no type for that column.
140+ *
141+ * ROWID is used for columns representing aliases to the rowid
142+ * (INTEGER PRIMARY KEY, with or without AUTOINCREMENT), to make the
143+ * concept explicit.  Such columns are actually stored as NULL, so
144+ * they cannot be simply ignored.  The exposed schema will be INTEGER
145+ * for that column.
146+ *
147+ * NOT NULL causes rows with a NULL in that column to be skipped.  It
148+ * also adds NOT NULL to the column in the exposed schema.  If the
149+ * table has ever had columns added using ALTER TABLE, then those
150+ * columns implicitly contain NULL for rows which have not been
151+ * updated.  [Workaround using COALESCE() in your SELECT statement.]
152+ *
153+ * The created table is read-only, with no indices.  Any SELECT will
154+ * be a full-table scan, returning each valid row read from the
155+ * storage of the backing table.  The rowid will be the rowid of the
156+ * row from the backing table.  "Valid" means:
157+ * - The cell metadata for the row is well-formed.  Mainly this means that
158+ *   the cell header info describes a payload of the size indicated by
159+ *   the cell's payload size.
160+ * - The cell does not run off the page.
161+ * - The cell does not overlap any other cell on the page.
162+ * - The cell doesn't contain too many columns.
163+ * - The types of the serialized data match the indicated types (see below).
164+ *
165+ *
166+ * Type affinity versus type storage.
167+ *
168+ * http://www.sqlite.org/datatype3.html describes SQLite's type
169+ * affinity system.  The system provides for automated coercion of
170+ * types in certain cases, transparently enough that many developers
171+ * do not realize that it is happening.  Importantly, it implies that
172+ * the raw data stored in the database may not have the obvious type.
173+ *
174+ * Differences between the stored data types and the expected data
175+ * types may be a signal of corruption.  This module makes some
176+ * allowances for automatic coercion.  It is important to be conscious
177+ * of the difference between the schema exposed by the module, and the
178+ * data types read from storage.  The following table describes how
179+ * the module interprets things:
180+ *
181+ * type     schema   data                     STRICT
182+ * ----     ------   ----                     ------
183+ * ANY      <none>   any                      any
184+ * ROWID    INTEGER  n/a                      n/a
185+ * INTEGER  INTEGER  integer                  integer
186+ * FLOAT    FLOAT    integer or float         float
187+ * NUMERIC  NUMERIC  integer, float, or text  integer or float
188+ * TEXT     TEXT     text or blob             text
189+ * BLOB     BLOB     blob                     blob
190+ *
191+ * type is the type provided to the recover module, schema is the
192+ * schema exposed by the module, data is the acceptable types of data
193+ * decoded from storage, and STRICT is a modification of that.
194+ *
195+ * A very loose recovery system might use ANY for all columns, then
196+ * use the appropriate sqlite3_column_*() calls to coerce to expected
197+ * types.  This doesn't provide much protection if a page from a
198+ * different table with the same column count is linked into an
199+ * inappropriate btree.
200+ *
201+ * A very tight recovery system might use STRICT to enforce typing on
202+ * all columns, preferring to skip rows which are valid at the storage
203+ * level but don't contain the right types.  Note that FLOAT STRICT is
204+ * almost certainly not appropriate, since integral values are
205+ * transparently stored as integers, when that is more efficient.
206+ *
207+ * Another option is to use ANY for all columns and inspect each
208+ * result manually (using sqlite3_column_*).  This should only be
209+ * necessary in cases where developers have used manifest typing (test
210+ * to make sure before you decide that you aren't using manifest
211+ * typing!).
212+ *
213+ *
214+ * Caveats
215+ *
216+ * Leaf pages not referenced by interior nodes will not be found.
217+ *
218+ * Leaf pages referenced from interior nodes of other tables will not
219+ * be resolved.
220+ *
221+ * Rows referencing invalid overflow pages will be skipped.
222+ *
223+ * SQLite rows have a header which describes how to interpret the rest
224+ * of the payload.  The header can be valid in cases where the rest of
225+ * the record is actually corrupt (in the sense that the data is not
226+ * the intended data).  This can especially happen WRT overflow pages,
227+ * as lack of atomic updates between pages is the primary form of
228+ * corruption I have seen in the wild.
229+ */
230+/* The implementation is via a series of cursors.  The cursor
231+ * implementations follow the pattern:
232+ *
233+ * // Creates the cursor using various initialization info.
234+ * int cursorCreate(...);
235+ *
236+ * // Returns 1 if there is no more data, 0 otherwise.
237+ * int cursorEOF(Cursor *pCursor);
238+ *
239+ * // Various accessors can be used if not at EOF.
240+ *
241+ * // Move to the next item.
242+ * int cursorNext(Cursor *pCursor);
243+ *
244+ * // Destroy the memory associated with the cursor.
245+ * void cursorDestroy(Cursor *pCursor);
246+ *
247+ * References in the following are to sections at
248+ * http://www.sqlite.org/fileformat2.html .
249+ *
250+ * RecoverLeafCursor iterates the records in a leaf table node
251+ * described in section 1.5 "B-tree Pages".  When the node is
252+ * exhausted, an interior cursor is used to get the next leaf node,
253+ * and iteration continues there.
254+ *
255+ * RecoverInteriorCursor iterates the child pages in an interior table
256+ * node described in section 1.5 "B-tree Pages".  When the node is
257+ * exhausted, a parent interior cursor is used to get the next
258+ * interior node at the same level, and iteration continues there.
259+ *
260+ * Together these record the path from the leaf level to the root of
261+ * the tree.  Iteration happens from the leaves rather than the root
262+ * both for efficiency and because putting the special case at the
263+ * front of the list is easier to implement.
264+ *
265+ * RecoverCursor uses a RecoverLeafCursor to iterate the rows of a
266+ * table, returning results via the SQLite virtual table interface.
267+ */
268+/* TODO(shess): It might be useful to allow DEFAULT in types to
269+ * specify what to do for NULL when an ALTER TABLE case comes up.
270+ * Unfortunately, simply adding it to the exposed schema and using
271+ * sqlite3_result_null() does not cause the default to be generated.
272+ * Handling it ourselves seems hard, unfortunately.
273+ */
274+
275+#include <assert.h>
276+#include <ctype.h>
277+#include <stdio.h>
278+#include <string.h>
279+
280+/* Internal SQLite things that are used:
281+ * u32, u64, i64 types.
282+ * Btree, Pager, and DbPage structs.
283+ * DbPage.pData, .pPager, and .pgno
284+ * sqlite3 struct.
285+ * sqlite3BtreePager() and sqlite3BtreeGetPageSize()
286+ * sqlite3PagerAcquire() and sqlite3PagerUnref()
287+ * getVarint().
288+ */
289+#include "sqliteInt.h"
290+
291+/* For debugging. */
292+#if 0
293+#define FNENTRY() fprintf(stderr, "In %s\n", __FUNCTION__)
294+#else
295+#define FNENTRY()
296+#endif
297+
298+/* Generic constants and helper functions. */
299+
300+static const unsigned char kTableLeafPage = 0x0D;
301+static const unsigned char kTableInteriorPage = 0x05;
302+
303+/* From section 1.5. */
304+static const unsigned kiPageTypeOffset = 0;
305+static const unsigned kiPageFreeBlockOffset = 1;
306+static const unsigned kiPageCellCountOffset = 3;
307+static const unsigned kiPageCellContentOffset = 5;
308+static const unsigned kiPageFragmentedBytesOffset = 7;
309+static const unsigned knPageLeafHeaderBytes = 8;
310+/* Interior pages contain an additional field. */
311+static const unsigned kiPageRightChildOffset = 8;
312+static const unsigned kiPageInteriorHeaderBytes = 12;
313+
314+/* Accepted types are specified by a mask. */
315+#define MASK_ROWID (1<<0)
316+#define MASK_INTEGER (1<<1)
317+#define MASK_FLOAT (1<<2)
318+#define MASK_TEXT (1<<3)
319+#define MASK_BLOB (1<<4)
320+#define MASK_NULL (1<<5)
321+
322+/* Helpers to decode fixed-size fields. */
323+static u32 decodeUnsigned16(const unsigned char *pData){
324+  return (pData[0]<<8) + pData[1];
325+}
326+static u32 decodeUnsigned32(const unsigned char *pData){
327+  return (decodeUnsigned16(pData)<<16) + decodeUnsigned16(pData+2);
328+}
329+static i64 decodeSigned(const unsigned char *pData, unsigned nBytes){
330+  i64 r = (char)(*pData);
331+  while( --nBytes ){
332+    r <<= 8;
333+    r += *(++pData);
334+  }
335+  return r;
336+}
337+/* Derived from vdbeaux.c, sqlite3VdbeSerialGet(), case 7. */
338+/* TODO(shess): Determine if swapMixedEndianFloat() applies. */
339+static double decodeFloat64(const unsigned char *pData){
340+#if !defined(NDEBUG)
341+  static const u64 t1 = ((u64)0x3ff00000)<<32;
342+  static const double r1 = 1.0;
343+  u64 t2 = t1;
344+  assert( sizeof(r1)==sizeof(t2) && memcmp(&r1, &t2, sizeof(r1))==0 );
345+#endif
346+  i64 x = decodeSigned(pData, 8);
347+  double d;
348+  memcpy(&d, &x, sizeof(x));
349+  return d;
350+}
351+
352+/* Return true if a varint can safely be read from pData/nData. */
353+/* TODO(shess): DbPage points into the middle of a buffer which
354+ * contains the page data before DbPage.  So code should always be
355+ * able to read a small number of varints safely.  Consider whether to
356+ * trust that or not.
357+ */
358+static int checkVarint(const unsigned char *pData, unsigned nData){
359+  unsigned i;
360+
361+  /* In the worst case the decoder takes all 8 bits of the 9th byte. */
362+  if( nData>=9 ){
363+    return 1;
364+  }
365+
366+  /* Look for a high-bit-clear byte in what's left. */
367+  for( i=0; i<nData; ++i ){
368+    if( !(pData[i]&0x80) ){
369+      return 1;
370+    }
371+  }
372+
373+  /* Cannot decode in the space given. */
374+  return 0;
375+}
376+
377+/* Return 1 if n varints can be read from pData/nData. */
378+static int checkVarints(const unsigned char *pData, unsigned nData,
379+                        unsigned n){
380+  unsigned nCur = 0;   /* Byte offset within current varint. */
381+  unsigned nFound = 0; /* Number of varints found. */
382+  unsigned i;
383+
384+  /* In the worst case the decoder takes all 8 bits of the 9th byte. */
385+  if( nData>=9*n ){
386+    return 1;
387+  }
388+
389+  for( i=0; nFound<n && i<nData; ++i ){
390+    nCur++;
391+    if( nCur==9 || !(pData[i]&0x80) ){
392+      nFound++;
393+      nCur = 0;
394+    }
395+  }
396+
397+  return nFound==n;
398+}
399+
400+/* ctype and str[n]casecmp() can be affected by locale (eg, tr_TR).
401+ * These versions consider only the ASCII space.
402+ */
403+/* TODO(shess): It may be reasonable to just remove the need for these
404+ * entirely.  The module could require "TEXT STRICT NOT NULL", not
405+ * "Text Strict Not Null" or whatever the developer felt like typing
406+ * that day.  Handling corrupt data is a PERFECT place to be pedantic.
407+ */
408+static int ascii_isspace(char c){
409+  /* From fts3_expr.c */
410+  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
411+}
412+static int ascii_isalnum(int x){
413+  /* From fts3_tokenizer1.c */
414+  return (x>='0' && x<='9') || (x>='A' && x<='Z') || (x>='a' && x<='z');
415+}
416+static int ascii_tolower(int x){
417+  /* From fts3_tokenizer1.c */
418+  return (x>='A' && x<='Z') ? x-'A'+'a' : x;
419+}
420+/* TODO(shess): Consider sqlite3_strnicmp() */
421+static int ascii_strncasecmp(const char *s1, const char *s2, size_t n){
422+  const unsigned char *us1 = (const unsigned char *)s1;
423+  const unsigned char *us2 = (const unsigned char *)s2;
424+  while( *us1 && *us2 && n && ascii_tolower(*us1)==ascii_tolower(*us2) ){
425+    us1++, us2++, n--;
426+  }
427+  return n ? ascii_tolower(*us1)-ascii_tolower(*us2) : 0;
428+}
429+static int ascii_strcasecmp(const char *s1, const char *s2){
430+  /* If s2 is equal through strlen(s1), will exit while() due to s1's
431+   * trailing NUL, and return NUL-s2[strlen(s1)].
432+   */
433+  return ascii_strncasecmp(s1, s2, strlen(s1)+1);
434+}
435+
436+/* For some reason I kept making mistakes with offset calculations. */
437+static const unsigned char *PageData(DbPage *pPage, unsigned iOffset){
438+  assert( iOffset<=pPage->nPageSize );
439+  return (unsigned char *)pPage->pData + iOffset;
440+}
441+
442+/* The first page in the file contains a file header in the first 100
443+ * bytes.  The page's header information comes after that.  Note that
444+ * the offsets in the page's header information are relative to the
445+ * beginning of the page, NOT the end of the page header.
446+ */
447+static const unsigned char *PageHeader(DbPage *pPage){
448+  if( pPage->pgno==1 ){
449+    const unsigned nDatabaseHeader = 100;
450+    return PageData(pPage, nDatabaseHeader);
451+  }else{
452+    return PageData(pPage, 0);
453+  }
454+}
455+
456+/* Helper to fetch the pager and page size for the named database. */
457+static int GetPager(sqlite3 *db, const char *zName,
458+                    Pager **pPager, unsigned *pnPageSize){
459+  Btree *pBt = NULL;
460+  int i;
461+  for( i=0; i<db->nDb; ++i ){
462+    if( ascii_strcasecmp(db->aDb[i].zName, zName)==0 ){
463+      pBt = db->aDb[i].pBt;
464+      break;
465+    }
466+  }
467+  if( !pBt ){
468+    return SQLITE_ERROR;
469+  }
470+
471+  *pPager = sqlite3BtreePager(pBt);
472+  *pnPageSize = sqlite3BtreeGetPageSize(pBt) - sqlite3BtreeGetReserve(pBt);
473+  return SQLITE_OK;
474+}
475+
476+/* iSerialType is a type read from a record header.  See "2.1 Record Format".
477+ */
478+
479+/* Storage size of iSerialType in bytes.  My interpretation of SQLite
480+ * documentation is that text and blob fields can have 32-bit length.
481+ * Values past 2^31-12 will need more than 32 bits to encode, which is
482+ * why iSerialType is u64.
483+ */
484+static u32 SerialTypeLength(u64 iSerialType){
485+  switch( iSerialType ){
486+    case 0 : return 0;  /* NULL */
487+    case 1 : return 1;  /* Various integers. */
488+    case 2 : return 2;
489+    case 3 : return 3;
490+    case 4 : return 4;
491+    case 5 : return 6;
492+    case 6 : return 8;
493+    case 7 : return 8;  /* 64-bit float. */
494+    case 8 : return 0;  /* Constant 0. */
495+    case 9 : return 0;  /* Constant 1. */
496+    case 10 : case 11 : assert( !"RESERVED TYPE"); return 0;
497+  }
498+  return (u32)((iSerialType>>1) - 6);
499+}
500+
501+/* True if iSerialType refers to a blob. */
502+static int SerialTypeIsBlob(u64 iSerialType){
503+  assert( iSerialType>=12 );
504+  return (iSerialType%2)==0;
505+}
506+
507+/* Returns true if the serialized type represented by iSerialType is
508+ * compatible with the given type mask.
509+ */
510+static int SerialTypeIsCompatible(u64 iSerialType, unsigned char mask){
511+  switch( iSerialType ){
512+    case 0  : return (mask&MASK_NULL)!=0;
513+    case 1  : return (mask&MASK_INTEGER)!=0;
514+    case 2  : return (mask&MASK_INTEGER)!=0;
515+    case 3  : return (mask&MASK_INTEGER)!=0;
516+    case 4  : return (mask&MASK_INTEGER)!=0;
517+    case 5  : return (mask&MASK_INTEGER)!=0;
518+    case 6  : return (mask&MASK_INTEGER)!=0;
519+    case 7  : return (mask&MASK_FLOAT)!=0;
520+    case 8  : return (mask&MASK_INTEGER)!=0;
521+    case 9  : return (mask&MASK_INTEGER)!=0;
522+    case 10 : assert( !"RESERVED TYPE"); return 0;
523+    case 11 : assert( !"RESERVED TYPE"); return 0;
524+  }
525+  return (mask&(SerialTypeIsBlob(iSerialType) ? MASK_BLOB : MASK_TEXT));
526+}
527+
528+/* Versions of strdup() with return values appropriate for
529+ * sqlite3_free().  malloc.c has sqlite3DbStrDup()/NDup(), but those
530+ * need sqlite3DbFree(), which seems intrusive.
531+ */
532+static char *sqlite3_strndup(const char *z, unsigned n){
533+  char *zNew;
534+
535+  if( z==NULL ){
536+    return NULL;
537+  }
538+
539+  zNew = sqlite3_malloc(n+1);
540+  if( zNew!=NULL ){
541+    memcpy(zNew, z, n);
542+    zNew[n] = '\0';
543+  }
544+  return zNew;
545+}
546+static char *sqlite3_strdup(const char *z){
547+  if( z==NULL ){
548+    return NULL;
549+  }
550+  return sqlite3_strndup(z, strlen(z));
551+}
552+
553+/* Fetch the page number of zTable in zDb from sqlite_master in zDb,
554+ * and put it in *piRootPage.
555+ */
556+static int getRootPage(sqlite3 *db, const char *zDb, const char *zTable,
557+                       u32 *piRootPage){
558+  char *zSql;  /* SQL selecting root page of named element. */
559+  sqlite3_stmt *pStmt;
560+  int rc;
561+
562+  if( strcmp(zTable, "sqlite_master")==0 ){
563+    *piRootPage = 1;
564+    return SQLITE_OK;
565+  }
566+
567+  zSql = sqlite3_mprintf("SELECT rootpage FROM %s.sqlite_master "
568+                         "WHERE type = 'table' AND tbl_name = %Q",
569+                         zDb, zTable);
570+  if( !zSql ){
571+    return SQLITE_NOMEM;
572+  }
573+
574+  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
575+  sqlite3_free(zSql);
576+  if( rc!=SQLITE_OK ){
577+    return rc;
578+  }
579+
580+  /* Require a result. */
581+  rc = sqlite3_step(pStmt);
582+  if( rc==SQLITE_DONE ){
583+    rc = SQLITE_CORRUPT;
584+  }else if( rc==SQLITE_ROW ){
585+    *piRootPage = sqlite3_column_int(pStmt, 0);
586+
587+    /* Require only one result. */
588+    rc = sqlite3_step(pStmt);
589+    if( rc==SQLITE_DONE ){
590+      rc = SQLITE_OK;
591+    }else if( rc==SQLITE_ROW ){
592+      rc = SQLITE_CORRUPT;
593+    }
594+  }
595+  sqlite3_finalize(pStmt);
596+  return rc;
597+}
598+
599+static int getEncoding(sqlite3 *db, const char *zDb, int* piEncoding){
600+  sqlite3_stmt *pStmt;
601+  int rc;
602+  char *zSql = sqlite3_mprintf("PRAGMA %s.encoding", zDb);
603+  if( !zSql ){
604+    return SQLITE_NOMEM;
605+  }
606+
607+  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
608+  sqlite3_free(zSql);
609+  if( rc!=SQLITE_OK ){
610+    return rc;
611+  }
612+
613+  /* Require a result. */
614+  rc = sqlite3_step(pStmt);
615+  if( rc==SQLITE_DONE ){
616+    /* This case should not be possible. */
617+    rc = SQLITE_CORRUPT;
618+  }else if( rc==SQLITE_ROW ){
619+    if( sqlite3_column_type(pStmt, 0)==SQLITE_TEXT ){
620+      const char* z = (const char *)sqlite3_column_text(pStmt, 0);
621+      /* These strings match the literals in pragma.c. */
622+      if( !strcmp(z, "UTF-16le") ){
623+        *piEncoding = SQLITE_UTF16LE;
624+      }else if( !strcmp(z, "UTF-16be") ){
625+        *piEncoding = SQLITE_UTF16BE;
626+      }else if( !strcmp(z, "UTF-8") ){
627+        *piEncoding = SQLITE_UTF8;
628+      }else{
629+        /* This case should not be possible. */
630+        *piEncoding = SQLITE_UTF8;
631+      }
632+    }else{
633+      /* This case should not be possible. */
634+      *piEncoding = SQLITE_UTF8;
635+    }
636+
637+    /* Require only one result. */
638+    rc = sqlite3_step(pStmt);
639+    if( rc==SQLITE_DONE ){
640+      rc = SQLITE_OK;
641+    }else if( rc==SQLITE_ROW ){
642+      /* This case should not be possible. */
643+      rc = SQLITE_CORRUPT;
644+    }
645+  }
646+  sqlite3_finalize(pStmt);
647+  return rc;
648+}
649+
650+/* Cursor for iterating interior nodes.  Interior page cells contain a
651+ * child page number and a rowid.  The child page contains items left
652+ * of the rowid (less than).  The rightmost page of the subtree is
653+ * stored in the page header.
654+ *
655+ * interiorCursorDestroy - release all resources associated with the
656+ *                         cursor and any parent cursors.
657+ * interiorCursorCreate - create a cursor with the given parent and page.
658+ * interiorCursorEOF - returns true if neither the cursor nor the
659+ *                     parent cursors can return any more data.
660+ * interiorCursorNextPage - fetch the next child page from the cursor.
661+ *
662+ * Logically, interiorCursorNextPage() returns the next child page
663+ * number from the page the cursor is currently reading, calling the
664+ * parent cursor as necessary to get new pages to read, until done.
665+ * SQLITE_ROW if a page is returned, SQLITE_DONE if out of pages,
666+ * error otherwise.  Unfortunately, if the table is corrupted
667+ * unexpected pages can be returned.  If any unexpected page is found,
668+ * leaf or otherwise, it is returned to the caller for processing,
669+ * with the interior cursor left empty.  The next call to
670+ * interiorCursorNextPage() will recurse to the parent cursor until an
671+ * interior page to iterate is returned.
672+ *
673+ * Note that while interiorCursorNextPage() will refuse to follow
674+ * loops, it does not keep track of pages returned for purposes of
675+ * preventing duplication.
676+ *
677+ * Note that interiorCursorEOF() could return false (not at EOF), and
678+ * interiorCursorNextPage() could still return SQLITE_DONE.  This
679+ * could happen if there are more cells to iterate in an interior
680+ * page, but those cells refer to invalid pages.
681+ */
682+typedef struct RecoverInteriorCursor RecoverInteriorCursor;
683+struct RecoverInteriorCursor {
684+  RecoverInteriorCursor *pParent; /* Parent node to this node. */
685+  DbPage *pPage;                  /* Reference to interior page. */
686+  unsigned nPageSize;             /* Size of page. */
687+  unsigned nChildren;             /* Number of children on the page. */
688+  unsigned iChild;                /* Index of next child to return. */
689+};
690+
691+static void interiorCursorDestroy(RecoverInteriorCursor *pCursor){
692+  /* Destroy all the cursors to the root. */
693+  while( pCursor ){
694+    RecoverInteriorCursor *p = pCursor;
695+    pCursor = pCursor->pParent;
696+
697+    if( p->pPage ){
698+      sqlite3PagerUnref(p->pPage);
699+      p->pPage = NULL;
700+    }
701+
702+    memset(p, 0xA5, sizeof(*p));
703+    sqlite3_free(p);
704+  }
705+}
706+
707+/* Internal helper.  Reset storage in preparation for iterating pPage. */
708+static void interiorCursorSetPage(RecoverInteriorCursor *pCursor,
709+                                  DbPage *pPage){
710+  assert( PageHeader(pPage)[kiPageTypeOffset]==kTableInteriorPage );
711+
712+  if( pCursor->pPage ){
713+    sqlite3PagerUnref(pCursor->pPage);
714+    pCursor->pPage = NULL;
715+  }
716+  pCursor->pPage = pPage;
717+  pCursor->iChild = 0;
718+
719+  /* A child for each cell, plus one in the header. */
720+  /* TODO(shess): Sanity-check the count?  Page header plus per-cell
721+   * cost of 16-bit offset, 32-bit page number, and one varint
722+   * (minimum 1 byte).
723+   */
724+  pCursor->nChildren = decodeUnsigned16(PageHeader(pPage) +
725+                                        kiPageCellCountOffset) + 1;
726+}
727+
728+static int interiorCursorCreate(RecoverInteriorCursor *pParent,
729+                                DbPage *pPage, int nPageSize,
730+                                RecoverInteriorCursor **ppCursor){
731+  RecoverInteriorCursor *pCursor =
732+    sqlite3_malloc(sizeof(RecoverInteriorCursor));
733+  if( !pCursor ){
734+    return SQLITE_NOMEM;
735+  }
736+
737+  memset(pCursor, 0, sizeof(*pCursor));
738+  pCursor->pParent = pParent;
739+  pCursor->nPageSize = nPageSize;
740+  interiorCursorSetPage(pCursor, pPage);
741+  *ppCursor = pCursor;
742+  return SQLITE_OK;
743+}
744+
745+/* Internal helper.  Return the child page number at iChild. */
746+static unsigned interiorCursorChildPage(RecoverInteriorCursor *pCursor){
747+  const unsigned char *pPageHeader;  /* Header of the current page. */
748+  const unsigned char *pCellOffsets; /* Offset to page's cell offsets. */
749+  unsigned iCellOffset;              /* Offset of target cell. */
750+
751+  assert( pCursor->iChild<pCursor->nChildren );
752+
753+  /* Rightmost child is in the header. */
754+  pPageHeader = PageHeader(pCursor->pPage);
755+  if( pCursor->iChild==pCursor->nChildren-1 ){
756+    return decodeUnsigned32(pPageHeader + kiPageRightChildOffset);
757+  }
758+
759+  /* Each cell is a 4-byte integer page number and a varint rowid
760+   * which is greater than the rowid of items in that sub-tree (this
761+   * module ignores ordering). The offset is from the beginning of the
762+   * page, not from the page header.
763+   */
764+  pCellOffsets = pPageHeader + kiPageInteriorHeaderBytes;
765+  iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iChild*2);
766+  if( iCellOffset<=pCursor->nPageSize-4 ){
767+    return decodeUnsigned32(PageData(pCursor->pPage, iCellOffset));
768+  }
769+
770+  /* TODO(shess): Check for cell overlaps?  Cells require 4 bytes plus
771+   * a varint.  Check could be identical to leaf check (or even a
772+   * shared helper testing for "Cells starting in this range"?).
773+   */
774+
775+  /* If the offset is broken, return an invalid page number. */
776+  return 0;
777+}
778+
779+static int interiorCursorEOF(RecoverInteriorCursor *pCursor){
780+  /* Find a parent with remaining children.  EOF if none found. */
781+  while( pCursor && pCursor->iChild>=pCursor->nChildren ){
782+    pCursor = pCursor->pParent;
783+  }
784+  return pCursor==NULL;
785+}
786+
787+/* Internal helper.  Used to detect if iPage would cause a loop. */
788+static int interiorCursorPageInUse(RecoverInteriorCursor *pCursor,
789+                                   unsigned iPage){
790+  /* Find any parent using the indicated page. */
791+  while( pCursor && pCursor->pPage->pgno!=iPage ){
792+    pCursor = pCursor->pParent;
793+  }
794+  return pCursor!=NULL;
795+}
796+
797+/* Get the next page from the interior cursor at *ppCursor.  Returns
798+ * SQLITE_ROW with the page in *ppPage, or SQLITE_DONE if out of
799+ * pages.  Children which would form a loop, or which the pager
800+ * cannot fetch, are skipped rather than reported as errors.
801+ *
802+ * If the tree is uneven, then when the cursor attempts to get a new
803+ * interior page from the parent cursor, it may get a non-interior
804+ * page.  In that case, the new page is returned, and *ppCursor is
805+ * updated to point to the parent cursor (this cursor is freed).
806+ */
807+/* TODO(shess): I've tried to avoid recursion in most of this code,
808+ * but this case is more challenging because the recursive call is in
809+ * the middle of operation.  One option for converting it without
810+ * adding memory management would be to retain the head pointer and
811+ * use a helper to "back up" as needed.  Another option would be to
812+ * reverse the list during traversal. */
813+static int interiorCursorNextPage(RecoverInteriorCursor **ppCursor,
814+                                  DbPage **ppPage){
815+  RecoverInteriorCursor *pCursor = *ppCursor;
816+  while( 1 ){
817+    int rc;
818+    const unsigned char *pPageHeader;  /* Header of found page. */
819+
820+    /* Find a valid child page which isn't on the stack. */
821+    while( pCursor->iChild<pCursor->nChildren ){
822+      const unsigned iPage = interiorCursorChildPage(pCursor);
823+      pCursor->iChild++;
824+      if( interiorCursorPageInUse(pCursor, iPage) ){
825+        fprintf(stderr, "Loop detected at %u\n", iPage);
826+      }else{
827+        rc = sqlite3PagerAcquire(pCursor->pPage->pPager, iPage, ppPage, 0);
828+        if( rc==SQLITE_OK ){
829+          return SQLITE_ROW;
830+        }
831+      }
832+    }
833+
834+    /* This page has no more children.  Get next page from parent. */
835+    if( !pCursor->pParent ){
836+      return SQLITE_DONE;
837+    }
838+    rc = interiorCursorNextPage(&pCursor->pParent, ppPage);
839+    if( rc!=SQLITE_ROW ){
840+      return rc;
841+    }
842+
843+    /* If a non-interior page is received, that either means that the
844+     * tree is uneven, or that a child was re-used (say as an overflow
845+     * page).  Remove this cursor and let the caller handle the page.
846+     */
847+    pPageHeader = PageHeader(*ppPage);
848+    if( pPageHeader[kiPageTypeOffset]!=kTableInteriorPage ){
849+      *ppCursor = pCursor->pParent;
850+      pCursor->pParent = NULL;
851+      interiorCursorDestroy(pCursor);
852+      return SQLITE_ROW;
853+    }
854+
855+    /* Iterate the new page. */
856+    interiorCursorSetPage(pCursor, *ppPage);
857+    *ppPage = NULL;
858+  }
859+
860+  assert(NULL);  /* NOTREACHED() */
861+  return SQLITE_CORRUPT;
862+}
863+
864+/* Large rows are spilled to overflow pages.  The row's main page
865+ * stores the overflow page number after the local payload, with a
866+ * linked list forward from there as necessary.  overflowMaybeCreate()
867+ * and overflowGetSegment() provide an abstraction for accessing such
868+ * data while centralizing the code.
869+ *
870+ * overflowDestroy - releases all resources associated with the structure.
871+ * overflowMaybeCreate - create the overflow structure if it is needed
872+ *                       to represent the given record.  See function comment.
873+ * overflowGetSegment - fetch a segment from the record, accounting
874+ *                      for overflow pages.  Segments which are not
875+ *                      entirely contained within a page are constructed
876+ *                      into a buffer which is returned.  See function comment.
877+ */
878+typedef struct RecoverOverflow RecoverOverflow;
879+struct RecoverOverflow {
880+  RecoverOverflow *pNextOverflow;  /* Next page in the list, or NULL. */
881+  DbPage *pPage;                   /* Pager reference to this overflow page. */
882+  unsigned nPageSize;              /* Size of pPage. */
883+};
884+
885+static void overflowDestroy(RecoverOverflow *pOverflow){
886+  RecoverOverflow *pNext;  /* Saved link; the node is scribbled below. */
887+  for( ; pOverflow; pOverflow = pNext ){
888+    pNext = pOverflow->pNextOverflow;
889+
890+    /* Drop the pager reference held by this node, if any. */
891+    if( pOverflow->pPage!=NULL ){
892+      sqlite3PagerUnref(pOverflow->pPage);
893+    }
894+    /* Scribble over the node before freeing it. */
895+    memset(pOverflow, 0xA5, sizeof(*pOverflow));
896+    sqlite3_free(pOverflow);
897+  }
898+}
899+
900+/* Internal helper.  True when iPage is already in the overflow list. */
901+static int overflowPageInUse(RecoverOverflow *pOverflow, unsigned iPage){
902+  for( ; pOverflow; pOverflow = pOverflow->pNextOverflow ){
903+    if( pOverflow->pPage->pgno==iPage ) return 1;
904+  }
905+  return 0;
906+}
907+
908+/* Setup to access an nRecordBytes record beginning at iRecordOffset
909+ * in pPage.  If nRecordBytes can be satisfied entirely from pPage,
910+ * then no overflow pages are needed and *pnLocalRecordBytes is set to
911+ * nRecordBytes.  Otherwise, *ppOverflow is set to the head of a list
912+ * of overflow pages, and *pnLocalRecordBytes is set to the number of
913+ * bytes local to pPage.
914+ *
915+ * overflowGetSegment() will do the right thing regardless of whether
916+ * those values are set to be in-page or not.
917+ *
918+ * Returns SQLITE_OK, SQLITE_CORRUPT if the record or its overflow
919+ * chain is inconsistent with the page, or a pager/allocation error.
920+ */
921+static int overflowMaybeCreate(DbPage *pPage, unsigned nPageSize,
922+                               unsigned iRecordOffset, unsigned nRecordBytes,
923+                               unsigned *pnLocalRecordBytes,
924+                               RecoverOverflow **ppOverflow){
925+  unsigned nLocalRecordBytes;  /* Record bytes in the leaf page. */
926+  unsigned iNextPage;          /* Next page number for record data. */
927+  unsigned nBytes;             /* Maximum record bytes as of current page. */
928+  int rc = SQLITE_OK;
929+  RecoverOverflow *pFirstOverflow = NULL;  /* First in linked list of pages. */
930+  RecoverOverflow *pLastOverflow = NULL;   /* End of linked list. */
931+
932+  /* Calculations from the "Table B-Tree Leaf Cell" part of section
933+   * 1.5 of http://www.sqlite.org/fileformat2.html .  maxLocal and
934+   * minLocal to match naming in btree.c. */
935+  const unsigned maxLocal = nPageSize - 35;
936+  const unsigned minLocal = ((nPageSize-12)*32/255)-23;  /* m */
937+
938+  /* Always fit anything smaller than maxLocal. */
939+  if( nRecordBytes<=maxLocal ){
940+    if( iRecordOffset+nRecordBytes>nPageSize ){
941+      return SQLITE_CORRUPT;  /* Would read off the end of the page. */
942+    }
943+    *pnLocalRecordBytes = nRecordBytes;
944+    *ppOverflow = NULL;
945+    return SQLITE_OK;
946+  }
947+
948+  /* Calculate the remainder after accounting for minLocal on the leaf
949+   * page and what packs evenly into overflow pages.  If the remainder
950+   * does not fit into maxLocal, then a partially-full overflow page
951+   * will be required in any case, so store as little as possible locally. */
952+  nLocalRecordBytes = minLocal+((nRecordBytes-minLocal)%(nPageSize-4));
953+  if( maxLocal<nLocalRecordBytes ){
954+    nLocalRecordBytes = minLocal;
955+  }
956+
957+  /* Don't read off the end of the page. */
958+  if( iRecordOffset+nLocalRecordBytes+4>nPageSize ){
959+    return SQLITE_CORRUPT;
960+  }
961+
962+  /* First overflow page number is after the local bytes. */
963+  iNextPage =
964+      decodeUnsigned32(PageData(pPage, iRecordOffset + nLocalRecordBytes));
965+  nBytes = nLocalRecordBytes;
966+
967+  /* While there are more pages to read, and more bytes are needed,
968+   * get another page. */
969+  while( iNextPage && nBytes<nRecordBytes ){
970+    RecoverOverflow *pOverflow;  /* New overflow page for the list. */
971+
972+    rc = sqlite3PagerAcquire(pPage->pPager, iNextPage, &pPage, 0);
973+    if( rc!=SQLITE_OK ){
974+      break;
975+    }
976+
977+    pOverflow = sqlite3_malloc(sizeof(RecoverOverflow));
978+    if( !pOverflow ){
979+      sqlite3PagerUnref(pPage);
980+      rc = SQLITE_NOMEM;
981+      break;
982+    }
983+    memset(pOverflow, 0, sizeof(*pOverflow));
984+    pOverflow->pPage = pPage;
985+    pOverflow->nPageSize = nPageSize;
986+
987+    if( !pFirstOverflow ){
988+      pFirstOverflow = pOverflow;
989+    }else{
990+      pLastOverflow->pNextOverflow = pOverflow;
991+    }
992+    pLastOverflow = pOverflow;
993+
994+    iNextPage = decodeUnsigned32(pPage->pData);
995+    nBytes += nPageSize-4;
996+
997+    /* Avoid loops. */
998+    if( overflowPageInUse(pFirstOverflow, iNextPage) ){
999+      fprintf(stderr, "Overflow loop detected at %u\n", iNextPage);
1000+      rc = SQLITE_CORRUPT;
1001+      break;
1002+    }
1003+  }
1004+
1005+  /* If there were not enough pages, or too many, things are corrupt.
1006+   * Not having enough pages is an obvious problem, all the data
1007+   * cannot be read.  Too many pages means that the contents of the
1008+   * row between the main page and the overflow page(s) is
1009+   * inconsistent (most likely one or more of the overflow pages does
1010+   * not really belong to this row). */
1011+  if( rc==SQLITE_OK && (nBytes<nRecordBytes || iNextPage) ){
1012+    rc = SQLITE_CORRUPT;
1013+  }
1014+
1015+  if( rc==SQLITE_OK ){
1016+    *ppOverflow = pFirstOverflow;
1017+    *pnLocalRecordBytes = nLocalRecordBytes;
1018+  }else if( pFirstOverflow ){
1019+    overflowDestroy(pFirstOverflow);
1020+  }
1021+  return rc;
1022+}
1023+
1024+/* Use in concert with overflowMaybeCreate() to efficiently read parts
1025+ * of a potentially-overflowing record.  pPage and iRecordOffset are
1026+ * the values passed into overflowMaybeCreate(), nLocalRecordBytes and
1027+ * pOverflow are the values returned by that call.
1028+ *
1029+ * On SQLITE_OK, *ppBase points to nRequestBytes of data at
1030+ * iRequestOffset within the record: a direct pointer when the data is
1031+ * contiguous in one page, else a sqlite3_malloc() buffer, with *pbFree
1032+ * set true if the caller must sqlite3_free() *ppBase.  SQLITE_NOMEM is
1033+ * returned if that buffer cannot be had, SQLITE_ERROR if out of range.
1034+ */
1035+/* Operation of this function is subtle.  At any time, pPage is the
1036+ * current page, with iRecordOffset and nLocalRecordBytes being record
1037+ * data within pPage, and pOverflow being the overflow page after
1038+ * pPage.  This allows the code to handle both the initial leaf page
1039+ * and overflow pages consistently by adjusting the values
1040+ * appropriately.
1041+ */
1042+static int overflowGetSegment(DbPage *pPage, unsigned iRecordOffset,
1043+                              unsigned nLocalRecordBytes,
1044+                              RecoverOverflow *pOverflow,
1045+                              unsigned iRequestOffset, unsigned nRequestBytes,
1046+                              unsigned char **ppBase, int *pbFree){
1047+  unsigned nBase;         /* Amount of data currently collected. */
1048+  unsigned char *pBase;   /* Buffer to collect record data into. */
1049+
1050+  /* Skip to the page containing the start of the data. */
1051+  while( iRequestOffset>=nLocalRecordBytes && pOverflow ){
1052+    /* Factor out current page's contribution. */
1053+    iRequestOffset -= nLocalRecordBytes;
1054+
1055+    /* Move forward to the next page in the list. */
1056+    pPage = pOverflow->pPage;
1057+    iRecordOffset = 4;  /* Data follows the 4-byte next-page number. */
1058+    nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
1059+    pOverflow = pOverflow->pNextOverflow;
1060+  }
1061+
1062+  /* If the requested data is entirely within this page, return a
1063+   * pointer into the page.
1064+   */
1065+  if( iRequestOffset+nRequestBytes<=nLocalRecordBytes ){
1066+    /* TODO(shess): "assignment discards qualifiers from pointer target type"
1067+     * Having ppBase be const makes sense, but sqlite3_free() takes non-const.
1068+     */
1069+    *ppBase = (unsigned char *)PageData(pPage, iRecordOffset + iRequestOffset);
1070+    *pbFree = 0;  /* Points into the page; nothing for the caller to free. */
1071+    return SQLITE_OK;
1072+  }
1073+
1074+  /* The data range would require additional pages. */
1075+  if( !pOverflow ){
1076+    /* Should never happen, the range is outside the nRecordBytes
1077+     * passed to overflowMaybeCreate().
1078+     */
1079+    assert(NULL);  /* NOTREACHED */
1080+    return SQLITE_ERROR;
1081+  }
1082+
1083+  /* Get a buffer to construct into. */
1084+  nBase = 0;
1085+  pBase = sqlite3_malloc(nRequestBytes);
1086+  if( !pBase ){
1087+    return SQLITE_NOMEM;
1088+  }
1089+  while( nBase<nRequestBytes ){
1090+    /* Copy over data present on this page. */
1091+    unsigned nCopyBytes = nRequestBytes - nBase;
1092+    if( nLocalRecordBytes-iRequestOffset<nCopyBytes ){
1093+      nCopyBytes = nLocalRecordBytes - iRequestOffset;
1094+    }
1095+    memcpy(pBase + nBase, PageData(pPage, iRecordOffset + iRequestOffset),
1096+           nCopyBytes);
1097+    nBase += nCopyBytes;
1098+
1099+    if( pOverflow ){
1100+      /* Copy from start of record data in future pages. */
1101+      iRequestOffset = 0;
1102+
1103+      /* Move forward to the next page in the list.  Should match
1104+       * first while() loop.
1105+       */
1106+      pPage = pOverflow->pPage;
1107+      iRecordOffset = 4;
1108+      nLocalRecordBytes = pOverflow->nPageSize - iRecordOffset;
1109+      pOverflow = pOverflow->pNextOverflow;
1110+    }else if( nBase<nRequestBytes ){
1111+      /* Ran out of overflow pages with data left to deliver.  Not
1112+       * possible if the requested range fits within nRecordBytes
1113+       * passed to overflowMaybeCreate() when creating pOverflow.
1114+       */
1115+      assert(NULL);  /* NOTREACHED */
1116+      sqlite3_free(pBase);
1117+      return SQLITE_ERROR;
1118+    }
1119+  }
1120+  assert( nBase==nRequestBytes );
1121+  *ppBase = pBase;
1122+  *pbFree = 1;  /* Caller owns pBase and must sqlite3_free() it. */
1123+  return SQLITE_OK;
1124+}
1125+
1126+/* Primary structure for iterating the contents of a table.
1127+ *
1128+ * leafCursorDestroy - release all resources associated with the cursor.
1129+ * leafCursorCreate - create a cursor to iterate items from tree at
1130+ *                    the provided root page.
1131+ * leafCursorNextValidCell - get the cursor ready to access data from
1132+ *                           the next valid cell in the table.
1133+ * leafCursorCellRowid - get the current cell's rowid.
1134+ * leafCursorCellColumns - get current cell's column count.
1135+ * leafCursorCellColInfo - get type and data for a column in current cell.
1136+ *
1137+ * leafCursorNextValidCell skips cells which fail simple integrity
1138+ * checks, such as overlapping other cells, or being located at
1139+ * impossible offsets, or where header data doesn't correctly describe
1140+ * payload data.  Returns SQLITE_ROW if a valid cell is found,
1141+ * SQLITE_DONE if all pages in the tree were exhausted.
1142+ *
1143+ * leafCursorCellColInfo() accounts for overflow pages in the style of
1144+ * overflowGetSegment().
1145+ */
1146+typedef struct RecoverLeafCursor RecoverLeafCursor;
1147+struct RecoverLeafCursor {
1148+  RecoverInteriorCursor *pParent;  /* Parent node to this node. */
1149+  DbPage *pPage;                   /* Reference to leaf page, or NULL. */
1150+  unsigned nPageSize;              /* Size of pPage. */
1151+  unsigned nCells;                 /* Number of cells in pPage. */
1152+  unsigned iCell;                  /* Index of current cell in pPage. */
1153+
1154+  /* Info parsed from data in iCell by leafCursorCellDecode(). */
1155+  i64 iRowid;                      /* rowid parsed. */
1156+  unsigned nRecordCols;            /* how many items in the record. */
1157+  u64 iRecordOffset;               /* offset in pPage to record data. */
1158+  /* TODO(shess): nRecordBytes and nRecordHeaderBytes are used in
1159+   * leafCursorCellColInfo() to prevent buffer overruns.
1160+   * leafCursorCellDecode() already verified that the cell is valid, so
1161+   * those checks should be redundant.
1162+   */
1163+  u64 nRecordBytes;                /* Size of record data. */
1164+  unsigned nLocalRecordBytes;      /* Amount of record data in-page. */
1165+  unsigned nRecordHeaderBytes;     /* Size of record header data. */
1166+  unsigned char *pRecordHeader;    /* Pointer to record header data. */
1167+  int bFreeRecordHeader;           /* True if record header requires free. */
1168+  RecoverOverflow *pOverflow;      /* Cell overflow info, if needed. */
1169+};
1170+
1171+/* Internal helper shared between next-page and create-cursor.  If
1172+ * pPage is a leaf page, it will be stored in the cursor and state
1173+ * initialized for reading cells.
1174+ *
1175+ * If pPage is an interior page, a new parent cursor is created and
1176+ * injected on the stack.  This is necessary to handle trees with
1177+ * uneven depth, but also is used during initial setup.
1178+ *
1179+ * If pPage is not a table page at all, it is discarded.
1180+ *
1181+ * If SQLITE_OK is returned, the caller no longer owns pPage,
1182+ * otherwise the caller is responsible for discarding it.
1183+ */
1184+static int leafCursorLoadPage(RecoverLeafCursor *pCursor, DbPage *pPage){
1185+  const unsigned char *pPageHeader;  /* Header of *pPage */
1186+
1187+  /* Release the current page. */
1188+  if( pCursor->pPage ){
1189+    sqlite3PagerUnref(pCursor->pPage);
1190+    pCursor->pPage = NULL;
1191+    pCursor->iCell = pCursor->nCells = 0;
1192+  }
1193+
1194+  /* If the page is an unexpected interior node, inject a new stack
1195+   * layer and try again from there.
1196+   */
1197+  pPageHeader = PageHeader(pPage);
1198+  if( pPageHeader[kiPageTypeOffset]==kTableInteriorPage ){
1199+    RecoverInteriorCursor *pParent;
1200+    int rc = interiorCursorCreate(pCursor->pParent, pPage, pCursor->nPageSize,
1201+                                  &pParent);
1202+    if( rc!=SQLITE_OK ){
1203+      return rc;  /* Caller still owns pPage. */
1204+    }
1205+    pCursor->pParent = pParent;
1206+    return SQLITE_OK;  /* No leaf loaded; pCursor->pPage is NULL. */
1207+  }
1208+
1209+  /* Not a leaf page, skip it. */
1210+  if( pPageHeader[kiPageTypeOffset]!=kTableLeafPage ){
1211+    sqlite3PagerUnref(pPage);
1212+    return SQLITE_OK;  /* No leaf loaded; pCursor->pPage is NULL. */
1213+  }
1214+
1215+  /* Take ownership of the page and start decoding. */
1216+  pCursor->pPage = pPage;
1217+  pCursor->iCell = 0;
1218+  pCursor->nCells = decodeUnsigned16(pPageHeader + kiPageCellCountOffset);
1219+  return SQLITE_OK;
1220+}
1221+
1222+/* Advance the cursor to the next leaf page of the tree.  Returns
1223+ * SQLITE_ROW once a leaf page has been loaded, SQLITE_DONE when the
1224+ * parent cursor has no further pages, or the error encountered.
1225+ */
1226+static int leafCursorNextPage(RecoverLeafCursor *pCursor){
1227+  DbPage *pCandidate;  /* Page handed back by the parent cursor. */
1228+  int rc;
1229+
1230+  if( pCursor->pParent==NULL ){
1231+    return SQLITE_DONE;
1232+  }
1233+
1234+  /* Keep pulling pages from the parent until one loads as a leaf. */
1235+  while( 1 ){
1236+    rc = interiorCursorNextPage(&pCursor->pParent, &pCandidate);
1237+    if( rc!=SQLITE_ROW ){
1238+      assert( rc==SQLITE_DONE );
1239+      return rc;
1240+    }
1241+    rc = leafCursorLoadPage(pCursor, pCandidate);
1242+    if( rc!=SQLITE_OK ){
1243+      sqlite3PagerUnref(pCandidate);
1244+      return rc;
1245+    }
1246+    if( pCursor->pPage ) return SQLITE_ROW;
1247+  }
1248+}
1249+
1250+static void leafCursorDestroyCellData(RecoverLeafCursor *pCursor){
1251+  /* Drop the overflow page list built for the current cell, if any. */
1252+  if( pCursor->pOverflow!=NULL ){
1253+    overflowDestroy(pCursor->pOverflow);
1254+    pCursor->pOverflow = NULL;
1255+  }
1256+  if( pCursor->bFreeRecordHeader ){  /* Header was copied to a buffer. */
1257+    sqlite3_free(pCursor->pRecordHeader);
1258+    pCursor->bFreeRecordHeader = 0;
1259+  }
1260+  pCursor->pRecordHeader = NULL;
1261+}
1262+
1263+static void leafCursorDestroy(RecoverLeafCursor *pCursor){
1264+  /* Per-cell state first: record header buffer and overflow list. */
1265+  leafCursorDestroyCellData(pCursor);
1266+
1267+  /* Then the parent interior cursor, if one exists. */
1268+  if( pCursor->pParent!=NULL ){
1269+    interiorCursorDestroy(pCursor->pParent);
1270+  }
1271+
1272+  /* Then the reference on the leaf page itself. */
1273+  if( pCursor->pPage!=NULL ){
1274+    sqlite3PagerUnref(pCursor->pPage);
1275+  }
1276+  memset(pCursor, 0xA5, sizeof(*pCursor));
1277+  sqlite3_free(pCursor);
1278+}
1279+
1280+/* Create a cursor to iterate the rows from the leaf pages of a table
1281+ * rooted at iRootPage.
1282+ */
1283+/* TODO(shess): recoverOpen() calls this to setup the cursor, and I
1284+ * think that recoverFilter() may make a hard assumption that the
1285+ * cursor returned will turn up at least one valid cell.
1286+ *
1287+ * The cases I can think of which break this assumption are:
1288+ * - pPage is a valid leaf page with no valid cells.
1289+ * - pPage is a valid interior page with no valid leaves.
1290+ * - pPage is a valid interior page whose leaves contain no valid cells.
1291+ * - pPage is not a valid leaf or interior page.
1292+ */
1293+static int leafCursorCreate(Pager *pPager, unsigned nPageSize,
1294+                            u32 iRootPage, RecoverLeafCursor **ppCursor){
1295+  DbPage *pPage;               /* Reference to page at iRootPage. */
1296+  RecoverLeafCursor *pCursor;  /* Leaf cursor being constructed. */
1297+  int rc;
1298+
1299+  /* Start out with the root page. */
1300+  rc = sqlite3PagerAcquire(pPager, iRootPage, &pPage, 0);
1301+  if( rc!=SQLITE_OK ){
1302+    return rc;
1303+  }
1304+
1305+  pCursor = sqlite3_malloc(sizeof(RecoverLeafCursor));
1306+  if( !pCursor ){
1307+    sqlite3PagerUnref(pPage);
1308+    return SQLITE_NOMEM;
1309+  }
1310+  memset(pCursor, 0, sizeof(*pCursor));
1311+
1312+  pCursor->nPageSize = nPageSize;
1313+
1314+  rc = leafCursorLoadPage(pCursor, pPage);
1315+  if( rc!=SQLITE_OK ){
1316+    sqlite3PagerUnref(pPage);  /* Load failed, so pPage is still ours. */
1317+    leafCursorDestroy(pCursor);
1318+    return rc;
1319+  }
1320+
1321+  /* pPage wasn't a leaf page, find the next leaf page. */
1322+  if( !pCursor->pPage ){
1323+    rc = leafCursorNextPage(pCursor);
1324+    if( rc!=SQLITE_DONE && rc!=SQLITE_ROW ){
1325+      leafCursorDestroy(pCursor);
1326+      return rc;
1327+    }
1328+  }
1329+
1330+  *ppCursor = pCursor;  /* On SQLITE_DONE above, pCursor->pPage is NULL. */
1331+  return SQLITE_OK;
1332+}
1333+
1334+/* Returns SQLITE_ERROR.  Useful for setting breakpoints. */
1335+static int ValidateError(void){
1336+  return SQLITE_ERROR;
1337+}
1338+
1339+/* Setup the cursor for reading the information from cell iCell.
1340+ * Returns SQLITE_OK if the cell passes validation and the cursor's
1341+ * per-cell fields are filled in, otherwise ValidateError(). */
1342+static int leafCursorCellDecode(RecoverLeafCursor *pCursor){
1343+  const unsigned char *pPageHeader;  /* Header of current page. */
1344+  const unsigned char *pCellOffsets; /* Pointer to page's cell offsets. */
1345+  unsigned iCellOffset;              /* Offset of current cell (iCell). */
1346+  const unsigned char *pCell;        /* Pointer to data at iCellOffset. */
1347+  unsigned nCellMaxBytes;            /* Maximum local size of iCell. */
1348+  unsigned iEndOffset;               /* End of iCell's in-page data. */
1349+  u64 nRecordBytes;                  /* Expected size of cell, w/overflow. */
1350+  u64 iRowid;                        /* iCell's rowid (in table). */
1351+  unsigned nRead;                    /* Amount of cell read. */
1352+  unsigned nRecordHeaderRead;        /* Header data read. */
1353+  u64 nRecordHeaderBytes;            /* Header size expected. */
1354+  unsigned nRecordCols = 0;          /* Columns read from header. */
1355+  u64 nRecordColBytes = 0;           /* Bytes in payload for those columns. */
1356+  unsigned i;
1357+  int rc;
1358+
1359+  assert( pCursor->iCell<pCursor->nCells );
1360+  leafCursorDestroyCellData(pCursor);
1361+
1362+  /* Find the offset to the row; the pointer array must be in-page. */
1363+  pPageHeader = PageHeader(pCursor->pPage);
1364+  pCellOffsets = pPageHeader + knPageLeafHeaderBytes;
1365+  if( (unsigned)(pCellOffsets - PageData(pCursor->pPage, 0))
1366+      + pCursor->nCells*2 > pCursor->nPageSize ){
1367+    return ValidateError();
1368+  }
1369+  iCellOffset = decodeUnsigned16(pCellOffsets + pCursor->iCell*2);
1370+  if( iCellOffset>=pCursor->nPageSize ){
1371+    return ValidateError();
1372+  }
1373+
1374+  pCell = PageData(pCursor->pPage, iCellOffset);
1375+  nCellMaxBytes = pCursor->nPageSize - iCellOffset;
1376+
1377+  /* B-tree leaf cells lead with varint record size, varint rowid and
1378+   * varint header size.  TODO(shess): The smallest page size is 512 bytes,
1379+   * which has an m of 39.  Three varints need at most 27 bytes.  I think. */
1380+  if( !checkVarints(pCell, nCellMaxBytes, 3) ){
1381+    return ValidateError();
1382+  }
1383+
1384+  nRead = getVarint(pCell, &nRecordBytes);
1385+  assert( iCellOffset+nRead<=pCursor->nPageSize );
1386+  if( nRecordBytes>0xffffffff ){
1387+    return ValidateError();  /* Sizes are narrowed to unsigned below. */
1388+  }
1389+  pCursor->nRecordBytes = nRecordBytes;
1390+
1391+  nRead += getVarint(pCell + nRead, &iRowid);
1392+  assert( iCellOffset+nRead<=pCursor->nPageSize );
1393+  pCursor->iRowid = (i64)iRowid;
1394+
1395+  pCursor->iRecordOffset = iCellOffset + nRead;
1396+
1397+  /* Overflow setup comes first; overlap check needs nLocalRecordBytes. */
1398+  rc = overflowMaybeCreate(pCursor->pPage, pCursor->nPageSize,
1399+                           pCursor->iRecordOffset, pCursor->nRecordBytes,
1400+                           &pCursor->nLocalRecordBytes,
1401+                           &pCursor->pOverflow);
1402+  if( rc!=SQLITE_OK ){
1403+    return ValidateError();
1404+  }
1405+
1406+  /* Check that no other cell starts within this cell. */
1407+  iEndOffset = pCursor->iRecordOffset + pCursor->nLocalRecordBytes;
1408+  for( i=0; i<pCursor->nCells; ++i ){
1409+    const unsigned iOtherOffset = decodeUnsigned16(pCellOffsets + i*2);
1410+    if( iOtherOffset>iCellOffset && iOtherOffset<iEndOffset ){
1411+      return ValidateError();
1412+    }
1413+  }
1414+
1415+  nRecordHeaderRead = getVarint(pCell + nRead, &nRecordHeaderBytes);
1416+  if( nRecordHeaderBytes>nRecordBytes ){
1417+    return ValidateError();  /* Untrusted data: validate, don't assert. */
1418+  }
1419+  pCursor->nRecordHeaderBytes = nRecordHeaderBytes;
1420+
1421+  /* Large headers could overflow if pages are small. */
1422+  rc = overflowGetSegment(pCursor->pPage,
1423+                          pCursor->iRecordOffset, pCursor->nLocalRecordBytes,
1424+                          pCursor->pOverflow, 0, nRecordHeaderBytes,
1425+                          &pCursor->pRecordHeader, &pCursor->bFreeRecordHeader);
1426+  if( rc!=SQLITE_OK ){
1427+    return ValidateError();
1428+  }
1429+
1430+  /* Tally up the column count and size of data. */
1431+  while( nRecordHeaderRead<nRecordHeaderBytes ){
1432+    u64 iSerialType;  /* Type descriptor for current column. */
1433+    if( !checkVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1434+                     nRecordHeaderBytes - nRecordHeaderRead) ){
1435+      return ValidateError();
1436+    }
1437+    nRecordHeaderRead += getVarint(pCursor->pRecordHeader + nRecordHeaderRead,
1438+                                   &iSerialType);
1439+    if( iSerialType==10 || iSerialType==11 ){
1440+      return ValidateError();
1441+    }
1442+    nRecordColBytes += SerialTypeLength(iSerialType);
1443+    nRecordCols++;
1444+  }
1445+  pCursor->nRecordCols = nRecordCols;
1446+
1447+  /* Parsing the header must use exactly as many bytes as expected, and
1448+   * the calculated record size must match the expected record size. */
1449+  if( nRecordHeaderRead!=nRecordHeaderBytes
1450+   || nRecordHeaderBytes+nRecordColBytes!=nRecordBytes ){
1451+    return ValidateError();
1452+  }
1453+
1454+  return SQLITE_OK;
1455+}
1456+
1457+static i64 leafCursorCellRowid(RecoverLeafCursor *pCursor){
1458+  return pCursor->iRowid;  /* Stored by leafCursorCellDecode(). */
1459+}
1460+
1461+static unsigned leafCursorCellColumns(RecoverLeafCursor *pCursor){
1462+  return pCursor->nRecordCols;  /* Tallied by leafCursorCellDecode(). */
1463+}
1464+
1465+/* Get column iCol's serial type (*piColType) and, unless ppBase is
1466+ * NULL, its data segment.  If *pbFree is true, *ppBase must be
1467+ * freed by the caller using sqlite3_free().
1468+ */
1469+static int leafCursorCellColInfo(RecoverLeafCursor *pCursor,
1470+                                 unsigned iCol, u64 *piColType,
1471+                                 unsigned char **ppBase, int *pbFree){
1472+  const unsigned char *pRecordHeader;  /* Current cell's header. */
1473+  u64 nRecordHeaderBytes;              /* Bytes in pRecordHeader. */
1474+  unsigned nRead;                      /* Bytes read from header. */
1475+  u64 iColEndOffset;                   /* Offset to end of column in cell. */
1476+  unsigned nColsSkipped;               /* Count columns as processed. */
1477+  u64 iSerialType;                     /* Type descriptor for current column. */
1478+
1479+  /* Implicit NULL for columns past the end.  This case happens when
1480+   * rows have not been updated since an ALTER TABLE added columns.
1481+   * It is more convenient to address here than in callers.
1482+   */
1483+  if( iCol>=pCursor->nRecordCols ){
1484+    *piColType = 0;
1485+    if( ppBase ){
1486+      *ppBase = 0;
1487+      *pbFree = 0;
1488+    }
1489+    return SQLITE_OK;
1490+  }
1491+
1492+  /* Must be able to decode header size. */
1493+  pRecordHeader = pCursor->pRecordHeader;
1494+  if( !checkVarint(pRecordHeader, pCursor->nRecordHeaderBytes) ){
1495+    return SQLITE_CORRUPT;
1496+  }
1497+
1498+  /* Rather than caching the header size and how many bytes it took,
1499+   * decode it every time.
1500+   */
1501+  nRead = getVarint(pRecordHeader, &nRecordHeaderBytes);
1502+  assert( nRecordHeaderBytes==pCursor->nRecordHeaderBytes );
1503+
1504+  /* Scan forward to the indicated column.  Scans to _after_ column
1505+   * for later range checking.
1506+   */
1507+  /* TODO(shess): This could get expensive for very wide tables.  An
1508+   * array of iSerialType could be built in leafCursorCellDecode(), but
1509+   * the number of columns is dynamic per row, so it would add memory
1510+   * management complexity.  Enough info to efficiently forward
1511+   * iterate could be kept, if all clients forward iterate
1512+   * (recoverColumn() may not).
1513+   */
1514+  iColEndOffset = 0;
1515+  nColsSkipped = 0;
1516+  while( nColsSkipped<=iCol && nRead<nRecordHeaderBytes ){
1517+    if( !checkVarint(pRecordHeader + nRead, nRecordHeaderBytes - nRead) ){
1518+      return SQLITE_CORRUPT;
1519+    }
1520+    nRead += getVarint(pRecordHeader + nRead, &iSerialType);
1521+    iColEndOffset += SerialTypeLength(iSerialType);
1522+    nColsSkipped++;
1523+  }
1524+
1525+  /* Column's data extends past record's end. */
1526+  if( nRecordHeaderBytes+iColEndOffset>pCursor->nRecordBytes ){
1527+    return SQLITE_CORRUPT;
1528+  }
1529+
1530+  *piColType = iSerialType;  /* NOTE(review): unset if the loop never ran. */
1531+  if( ppBase ){
1532+    const u32 nColBytes = SerialTypeLength(iSerialType);
1533+
1534+    /* Offset from start of record to beginning of column. */
1535+    const unsigned iColOffset = nRecordHeaderBytes+iColEndOffset-nColBytes;
1536+
1537+    return overflowGetSegment(pCursor->pPage, pCursor->iRecordOffset,
1538+                              pCursor->nLocalRecordBytes, pCursor->pOverflow,
1539+                              iColOffset, nColBytes, ppBase, pbFree);
1540+  }
1541+  return SQLITE_OK;
1542+}
1543+
1544+/* Advance to the next cell which decodes cleanly.  Returns SQLITE_ROW
1545+ * if one is found, else the result of leafCursorNextPage(). */
1546+static int leafCursorNextValidCell(RecoverLeafCursor *pCursor){
1547+  while( 1 ){
1548+    int rc;
1549+    /* Move to the next cell. */
1550+    pCursor->iCell++;
1551+    /* No more cells, get the next leaf. */
1552+    if( pCursor->iCell>=pCursor->nCells ){
1553+      rc = leafCursorNextPage(pCursor);
1554+      if( rc!=SQLITE_ROW ){
1555+        return rc;
1556+      }
1557+      assert( pCursor->iCell==0 );
1558+      if( pCursor->nCells==0 ) continue;  /* Empty leaf, nothing to decode. */
1559+    }
1560+
1561+    /* If the cell is valid, indicate that a row is available. */
1562+    rc = leafCursorCellDecode(pCursor);
1563+    if( rc==SQLITE_OK ){
1564+      return SQLITE_ROW;
1565+    }
1566+
1567+    /* Keep iterating.  TODO(shess): Remove debugging output. */
1568+    fprintf(stderr, "Skipping invalid cell\n");
1569+  }
1570+  return SQLITE_ERROR;
1571+}
1572+
1573+typedef struct Recover Recover;
1574+struct Recover {
1575+  sqlite3_vtab base;          /* Base class; sqlite3_vtab* is cast to Recover* */
1576+  sqlite3 *db;                /* Host database connection */
1577+  char *zDb;                  /* Database containing target table */
1578+  char *zTable;               /* Target table */
1579+  unsigned nCols;             /* Number of columns in target table */
1580+  unsigned char *pTypes;      /* Types of columns in target table */
1581+};
1582+
1583+/* Internal helper.  Frees pRecover along with zDb, zTable and pTypes. */
1584+static void recoverRelease(Recover *pRecover){
1585+  sqlite3_free(pRecover->pTypes);
1586+  sqlite3_free(pRecover->zTable);
1587+  sqlite3_free(pRecover->zDb);
1588+  memset(pRecover, 0xA5, sizeof(Recover));
1589+  sqlite3_free(pRecover);
1590+}
1591+
1592+/* Helper function for initializing the module, taking the arguments of
1593+ * recoverCreate()/recoverConnect().  Forward-declared so they can see it.
1594+ */
1595+static int recoverInit(
1596+  sqlite3 *, void *, int, const char *const*, sqlite3_vtab **, char **
1597+);
1598+
1599+static int recoverCreate(
1600+  sqlite3 *db,
1601+  void *pAux,
1602+  int argc, const char *const*argv,
1603+  sqlite3_vtab **ppVtab,
1604+  char **pzErr
1605+){
1606+  FNENTRY();  /* All work is delegated to recoverInit() below. */
1607+  return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1608+}
1609+
1610+/* This should never be called.  If it is, it acts as recoverCreate(). */
1611+static int recoverConnect(
1612+  sqlite3 *db,
1613+  void *pAux,
1614+  int argc, const char *const*argv,
1615+  sqlite3_vtab **ppVtab,
1616+  char **pzErr
1617+){
1618+  FNENTRY();
1619+  return recoverInit(db, pAux, argc, argv, ppVtab, pzErr);
1620+}
1621+
1622+/* No indices supported; this function does not fill in pIdxInfo. */
1623+static int recoverBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
1624+  FNENTRY();
1625+  return SQLITE_OK;
1626+}
1627+
1628+/* Logically, this should never be called.  Same as recoverDestroy(). */
1629+static int recoverDisconnect(sqlite3_vtab *pVtab){
1630+  FNENTRY();
1631+  recoverRelease((Recover*)pVtab);
1632+  return SQLITE_OK;
1633+}
1634+
1635+static int recoverDestroy(sqlite3_vtab *pVtab){
1636+  FNENTRY();
1637+  recoverRelease((Recover*)pVtab);  /* Frees pVtab itself as well. */
1638+  return SQLITE_OK;
1639+}
1640+
1641+typedef struct RecoverCursor RecoverCursor;
1642+struct RecoverCursor {
1643+  sqlite3_vtab_cursor base;  /* Base class; cursor pointers are cast to it. */
1644+  RecoverLeafCursor *pLeafCursor;  /* Leaf cursor built by recoverOpen(). */
1645+  int iEncoding;  /* Value getEncoding() reported in recoverOpen(). */
1646+  int bEOF;  /* NOTE(review): presumably end-of-rows flag; set elsewhere. */
1647+};
1648+
1649+static int recoverOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
1650+  Recover *pRecover = (Recover*)pVTab;
1651+  u32 iRootPage;                   /* Root page of the backing table. */
1652+  int iEncoding;                   /* UTF encoding for backing database. */
1653+  unsigned nPageSize;              /* Size of pages in backing database. */
1654+  Pager *pPager;                   /* Backing database pager. */
1655+  RecoverLeafCursor *pLeafCursor;  /* Cursor to read table's leaf pages. */
1656+  RecoverCursor *pCursor;          /* Cursor to read rows from leaves. */
1657+  int rc;
1658+
1659+  FNENTRY();
1660+
1661+  iRootPage = 0;
1662+  rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable,
1663+                   &iRootPage);
1664+  if( rc!=SQLITE_OK ){
1665+    return rc;
1666+  }
1667+
1668+  iEncoding = 0;
1669+  rc = getEncoding(pRecover->db, pRecover->zDb, &iEncoding);
1670+  if( rc!=SQLITE_OK ){
1671+    return rc;
1672+  }
1673+
1674+  rc = GetPager(pRecover->db, pRecover->zDb, &pPager, &nPageSize);
1675+  if( rc!=SQLITE_OK ){
1676+    return rc;
1677+  }
1678+
1679+  rc = leafCursorCreate(pPager, nPageSize, iRootPage, &pLeafCursor);
1680+  if( rc!=SQLITE_OK ){
1681+    return rc;
1682+  }
1683+
1684+  pCursor = sqlite3_malloc(sizeof(RecoverCursor));
1685+  if( !pCursor ){
1686+    leafCursorDestroy(pLeafCursor);
1687+    return SQLITE_NOMEM;
1688+  }
1689+  memset(pCursor, 0, sizeof(*pCursor));
1690+  pCursor->base.pVtab = pVTab;
1691+  pCursor->pLeafCursor = pLeafCursor;
1692+  pCursor->iEncoding = iEncoding;
1693+
1694+  *ppCursor = (sqlite3_vtab_cursor*)pCursor;
1695+  return SQLITE_OK;
1696+}
1697+
1698+static int recoverClose(sqlite3_vtab_cursor *cur){
1699+  RecoverCursor *pCursor = (RecoverCursor*)cur;
1700+  FNENTRY();
1701+  if( pCursor->pLeafCursor ){
1702+    leafCursorDestroy(pCursor->pLeafCursor);
1703+    pCursor->pLeafCursor = NULL;
1704+  }
1705+  memset(pCursor, 0xA5, sizeof(*pCursor));
1706+  sqlite3_free(cur);
1707+  return SQLITE_OK;
1708+}
1709+
1710+/* Helpful place to set a breakpoint. */
1711+static int RecoverInvalidCell(){
1712+  return SQLITE_ERROR;
1713+}
1714+
1715+/* Returns SQLITE_OK if the cell has an appropriate number of columns
1716+ * with the appropriate types of data.
1717+ */
1718+static int recoverValidateLeafCell(Recover *pRecover, RecoverCursor *pCursor){
1719+  unsigned i;
1720+
1721+  /* If the row's storage has too many columns, skip it. */
1722+  if( leafCursorCellColumns(pCursor->pLeafCursor)>pRecover->nCols ){
1723+    return RecoverInvalidCell();
1724+  }
1725+
1726+  /* Skip rows with unexpected types. */
1727+  for( i=0; i<pRecover->nCols; ++i ){
1728+    u64 iType;  /* Storage type of column i. */
1729+    int rc;
1730+
1731+    /* ROWID alias. */
1732+    if( (pRecover->pTypes[i]&MASK_ROWID) ){
1733+      continue;
1734+    }
1735+
1736+    rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iType, NULL, NULL);
1737+    assert( rc==SQLITE_OK );
1738+    if( rc!=SQLITE_OK || !SerialTypeIsCompatible(iType, pRecover->pTypes[i]) ){
1739+      return RecoverInvalidCell();
1740+    }
1741+  }
1742+
1743+  return SQLITE_OK;
1744+}
1745+
1746+static int recoverNext(sqlite3_vtab_cursor *pVtabCursor){
1747+  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1748+  Recover *pRecover = (Recover*)pCursor->base.pVtab;
1749+  int rc;
1750+
1751+  FNENTRY();
1752+
1753+  /* Scan forward to the next cell with valid storage, then check that
1754+   * the stored data matches the schema.
1755+   */
1756+  while( (rc = leafCursorNextValidCell(pCursor->pLeafCursor))==SQLITE_ROW ){
1757+    if( recoverValidateLeafCell(pRecover, pCursor)==SQLITE_OK ){
1758+      return SQLITE_OK;
1759+    }
1760+  }
1761+
1762+  if( rc==SQLITE_DONE ){
1763+    pCursor->bEOF = 1;
1764+    return SQLITE_OK;
1765+  }
1766+
1767+  assert( rc!=SQLITE_OK );
1768+  return rc;
1769+}
1770+
1771+static int recoverFilter(
1772+  sqlite3_vtab_cursor *pVtabCursor,
1773+  int idxNum, const char *idxStr,
1774+  int argc, sqlite3_value **argv
1775+){
1776+  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1777+  Recover *pRecover = (Recover*)pCursor->base.pVtab;
1778+  int rc;
1779+
1780+  FNENTRY();
1781+
1782+  /* Load the first cell, and iterate forward if it's not valid. */
1783+  /* TODO(shess): What happens if no cells at all are valid? */
1784+  rc = leafCursorCellDecode(pCursor->pLeafCursor);
1785+  if( rc!=SQLITE_OK || recoverValidateLeafCell(pRecover, pCursor)!=SQLITE_OK ){
1786+    return recoverNext(pVtabCursor);
1787+  }
1788+
1789+  return SQLITE_OK;
1790+}
1791+
1792+static int recoverEof(sqlite3_vtab_cursor *pVtabCursor){
1793+  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1794+  FNENTRY();
1795+  return pCursor->bEOF;
1796+}
1797+
1798+static int recoverColumn(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i){
1799+  RecoverCursor *pCursor = (RecoverCursor*)cur;
1800+  Recover *pRecover = (Recover*)pCursor->base.pVtab;
1801+  u64 iColType;             /* Storage type of column i. */
1802+  unsigned char *pColData;  /* Column i's data. */
1803+  int shouldFree;           /* Non-zero if pColData should be freed. */
1804+  int rc;
1805+
1806+  FNENTRY();
1807+
1808+  if( i>=pRecover->nCols ){
1809+    return SQLITE_ERROR;
1810+  }
1811+
1812+  /* ROWID alias. */
1813+  if( (pRecover->pTypes[i]&MASK_ROWID) ){
1814+    sqlite3_result_int64(ctx, leafCursorCellRowid(pCursor->pLeafCursor));
1815+    return SQLITE_OK;
1816+  }
1817+
1818+  pColData = NULL;
1819+  shouldFree = 0;
1820+  rc = leafCursorCellColInfo(pCursor->pLeafCursor, i, &iColType,
1821+                             &pColData, &shouldFree);
1822+  if( rc!=SQLITE_OK ){
1823+    return rc;
1824+  }
1825+  /* recoverValidateLeafCell() should guarantee that this will never
1826+   * occur.
1827+   */
1828+  if( !SerialTypeIsCompatible(iColType, pRecover->pTypes[i]) ){
1829+    if( shouldFree ){
1830+      sqlite3_free(pColData);
1831+    }
1832+    return SQLITE_ERROR;
1833+  }
1834+
1835+  switch( iColType ){
1836+    case 0 : sqlite3_result_null(ctx); break;
1837+    case 1 : sqlite3_result_int64(ctx, decodeSigned(pColData, 1)); break;
1838+    case 2 : sqlite3_result_int64(ctx, decodeSigned(pColData, 2)); break;
1839+    case 3 : sqlite3_result_int64(ctx, decodeSigned(pColData, 3)); break;
1840+    case 4 : sqlite3_result_int64(ctx, decodeSigned(pColData, 4)); break;
1841+    case 5 : sqlite3_result_int64(ctx, decodeSigned(pColData, 6)); break;
1842+    case 6 : sqlite3_result_int64(ctx, decodeSigned(pColData, 8)); break;
1843+    case 7 : sqlite3_result_double(ctx, decodeFloat64(pColData)); break;
1844+    case 8 : sqlite3_result_int(ctx, 0); break;
1845+    case 9 : sqlite3_result_int(ctx, 1); break;
1846+    case 10 : assert( iColType!=10 ); break;
1847+    case 11 : assert( iColType!=11 ); break;
1848+
1849+    default : {
1850+      u32 l = SerialTypeLength(iColType);
1851+
1852+      /* If pColData was already allocated, arrange to pass ownership. */
1853+      sqlite3_destructor_type pFn = SQLITE_TRANSIENT;
1854+      if( shouldFree ){
1855+        pFn = sqlite3_free;
1856+        shouldFree = 0;
1857+      }
1858+
1859+      if( SerialTypeIsBlob(iColType) ){
1860+        sqlite3_result_blob(ctx, pColData, l, pFn);
1861+      }else{
1862+        if( pCursor->iEncoding==SQLITE_UTF16LE ){
1863+          sqlite3_result_text16le(ctx, (const void*)pColData, l, pFn);
1864+        }else if( pCursor->iEncoding==SQLITE_UTF16BE ){
1865+          sqlite3_result_text16be(ctx, (const void*)pColData, l, pFn);
1866+        }else{
1867+          sqlite3_result_text(ctx, (const char*)pColData, l, pFn);
1868+        }
1869+      }
1870+    } break;
1871+  }
1872+  if( shouldFree ){
1873+    sqlite3_free(pColData);
1874+  }
1875+  return SQLITE_OK;
1876+}
1877+
1878+static int recoverRowid(sqlite3_vtab_cursor *pVtabCursor, sqlite_int64 *pRowid){
1879+  RecoverCursor *pCursor = (RecoverCursor*)pVtabCursor;
1880+  FNENTRY();
1881+  *pRowid = leafCursorCellRowid(pCursor->pLeafCursor);
1882+  return SQLITE_OK;
1883+}
1884+
/* Method table for the virtual-table module that recoverVtableInit()
 * registers under the name "recover".  Slots holding 0 are methods
 * this module does not provide.
 */
static sqlite3_module recoverModule = {
  0,                         /* iVersion */
  recoverCreate,             /* xCreate - create a table */
  recoverConnect,            /* xConnect - connect to an existing table */
  recoverBestIndex,          /* xBestIndex - Determine search strategy */
  recoverDisconnect,         /* xDisconnect - Disconnect from a table */
  recoverDestroy,            /* xDestroy - Drop a table */
  recoverOpen,               /* xOpen - open a cursor */
  recoverClose,              /* xClose - close a cursor */
  recoverFilter,             /* xFilter - position on first valid cell */
  recoverNext,               /* xNext - advance a cursor */
  recoverEof,                /* xEof - report end of scan */
  recoverColumn,             /* xColumn - read a column value */
  recoverRowid,              /* xRowid - read the rowid */
  0,                         /* xUpdate - write data */
  0,                         /* xBegin - begin transaction */
  0,                         /* xSync - sync transaction */
  0,                         /* xCommit - commit transaction */
  0,                         /* xRollback - rollback transaction */
  0,                         /* xFindFunction - function overloading */
  0,                         /* xRename - rename the table */
};
1907+
1908+int recoverVtableInit(sqlite3 *db){
1909+  return sqlite3_create_module_v2(db, "recover", &recoverModule, NULL, 0);
1910+}
1911+
1912+/* This section of code is for parsing the create input and
1913+ * initializing the module.
1914+ */
1915+
/* Locate the next word in zText.  Leading whitespace is skipped, then
 * the longest run of ASCII alphanumerics and '_' is taken as the word.
 * *pzWordStart receives the first character of that run and
 * *pzWordEnd the character just past it.  Returns true if the run is
 * non-empty.
 *
 * The two output pointers may refer to the same variable; the result
 * is computed from local copies so that case still works.
 */
static int findWord(const char *zText,
                    const char **pzWordStart, const char **pzWordEnd){
  const char *zBegin;
  const char *zStop;

  zBegin = zText;
  while( ascii_isspace(*zBegin) ){
    zBegin++;
  }

  zStop = zBegin;
  while( ascii_isalnum(*zStop) || *zStop=='_' ){
    zStop++;
  }

  *pzWordStart = zBegin;
  *pzWordEnd = zStop;
  return zStop>zBegin;
}
1934+
/* Return true if the next word in zText is exactly zWord (ASCII
 * case-insensitive), in which case *pzContinue is set to the character
 * after the word.  On a mismatch *pzContinue is left untouched and
 * false is returned.
 *
 * The found word must have the same length as zWord.  Comparing only
 * the found word's length would let any leading fragment of the
 * keyword (for example "NO" for "NOT") pass as the keyword itself.
 */
static int expectWord(const char *zText, const char *zWord,
                      const char **pzContinue){
  const char *zWordStart, *zWordEnd;
  size_t nFound;

  if( !findWord(zText, &zWordStart, &zWordEnd) ){
    return 0;
  }

  nFound = (size_t)(zWordEnd - zWordStart);
  if( nFound!=strlen(zWord) ){
    return 0;
  }
  if( ascii_strncasecmp(zWord, zWordStart, nFound)!=0 ){
    return 0;
  }

  *pzContinue = zWordEnd;
  return 1;
}
1948+
1949+/* Parse the name and type information out of parameter.  In case of
1950+ * success, *pzNameStart/End contain the name of the column,
1951+ * *pzTypeStart/End contain the top-level type, and *pTypeMask has the
1952+ * type mask to use for the column.
1953+ */
1954+static int findNameAndType(const char *parameter,
1955+                           const char **pzNameStart, const char **pzNameEnd,
1956+                           const char **pzTypeStart, const char **pzTypeEnd,
1957+                           unsigned char *pTypeMask){
1958+  unsigned nNameLen;   /* Length of found name. */
1959+  const char *zEnd;    /* Current end of parsed column information. */
1960+  int bNotNull;        /* Non-zero if NULL is not allowed for name. */
1961+  int bStrict;         /* Non-zero if column requires exact type match. */
1962+  const char *zDummy;  /* Dummy parameter, result unused. */
1963+  unsigned i;
1964+
1965+  /* strictMask is used for STRICT, strictMask|otherMask if STRICT is
1966+   * not supplied.  zReplace provides an alternate type to expose to
1967+   * the caller.
1968+   */
1969+  static struct {
1970+    const char *zName;
1971+    unsigned char strictMask;
1972+    unsigned char otherMask;
1973+    const char *zReplace;
1974+  } kTypeInfo[] = {
1975+    { "ANY",
1976+      MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL,
1977+      0, "",
1978+    },
1979+    { "ROWID",   MASK_INTEGER | MASK_ROWID,             0, "INTEGER", },
1980+    { "INTEGER", MASK_INTEGER | MASK_NULL,              0, NULL, },
1981+    { "FLOAT",   MASK_FLOAT | MASK_NULL,                MASK_INTEGER, NULL, },
1982+    { "NUMERIC", MASK_INTEGER | MASK_FLOAT | MASK_NULL, MASK_TEXT, NULL, },
1983+    { "TEXT",    MASK_TEXT | MASK_NULL,                 MASK_BLOB, NULL, },
1984+    { "BLOB",    MASK_BLOB | MASK_NULL,                 0, NULL, },
1985+  };
1986+
1987+  if( !findWord(parameter, pzNameStart, pzNameEnd) ){
1988+    return SQLITE_MISUSE;
1989+  }
1990+
1991+  /* Manifest typing, accept any storage type. */
1992+  if( !findWord(*pzNameEnd, pzTypeStart, pzTypeEnd) ){
1993+    *pzTypeEnd = *pzTypeStart = "";
1994+    *pTypeMask = MASK_INTEGER | MASK_FLOAT | MASK_BLOB | MASK_TEXT | MASK_NULL;
1995+    return SQLITE_OK;
1996+  }
1997+
1998+  nNameLen = *pzTypeEnd - *pzTypeStart;
1999+  for( i=0; i<ArraySize(kTypeInfo); ++i ){
2000+    if( ascii_strncasecmp(kTypeInfo[i].zName, *pzTypeStart, nNameLen)==0 ){
2001+      break;
2002+    }
2003+  }
2004+  if( i==ArraySize(kTypeInfo) ){
2005+    return SQLITE_MISUSE;
2006+  }
2007+
2008+  zEnd = *pzTypeEnd;
2009+  bStrict = 0;
2010+  if( expectWord(zEnd, "STRICT", &zEnd) ){
2011+    /* TODO(shess): Ick.  But I don't want another single-purpose
2012+     * flag, either.
2013+     */
2014+    if( kTypeInfo[i].zReplace && !kTypeInfo[i].zReplace[0] ){
2015+      return SQLITE_MISUSE;
2016+    }
2017+    bStrict = 1;
2018+  }
2019+
2020+  bNotNull = 0;
2021+  if( expectWord(zEnd, "NOT", &zEnd) ){
2022+    if( expectWord(zEnd, "NULL", &zEnd) ){
2023+      bNotNull = 1;
2024+    }else{
2025+      /* Anything other than NULL after NOT is an error. */
2026+      return SQLITE_MISUSE;
2027+    }
2028+  }
2029+
2030+  /* Anything else is an error. */
2031+  if( findWord(zEnd, &zDummy, &zDummy) ){
2032+    return SQLITE_MISUSE;
2033+  }
2034+
2035+  *pTypeMask = kTypeInfo[i].strictMask;
2036+  if( !bStrict ){
2037+    *pTypeMask |= kTypeInfo[i].otherMask;
2038+  }
2039+  if( bNotNull ){
2040+    *pTypeMask &= ~MASK_NULL;
2041+  }
2042+  if( kTypeInfo[i].zReplace ){
2043+    *pzTypeStart = kTypeInfo[i].zReplace;
2044+    *pzTypeEnd = *pzTypeStart + strlen(*pzTypeStart);
2045+  }
2046+  return SQLITE_OK;
2047+}
2048+
2049+/* Parse the arguments, placing type masks in *pTypes and the exposed
2050+ * schema in *pzCreateSql (for sqlite3_declare_vtab).
2051+ */
2052+static int ParseColumnsAndGenerateCreate(unsigned nCols,
2053+                                         const char *const *pCols,
2054+                                         char **pzCreateSql,
2055+                                         unsigned char *pTypes,
2056+                                         char **pzErr){
2057+  unsigned i;
2058+  char *zCreateSql = sqlite3_mprintf("CREATE TABLE x(");
2059+  if( !zCreateSql ){
2060+    return SQLITE_NOMEM;
2061+  }
2062+
2063+  for( i=0; i<nCols; i++ ){
2064+    const char *zSep = (i < nCols - 1 ? ", " : ")");
2065+    const char *zNotNull = "";
2066+    const char *zNameStart, *zNameEnd;
2067+    const char *zTypeStart, *zTypeEnd;
2068+    int rc = findNameAndType(pCols[i],
2069+                             &zNameStart, &zNameEnd,
2070+                             &zTypeStart, &zTypeEnd,
2071+                             &pTypes[i]);
2072+    if( rc!=SQLITE_OK ){
2073+      *pzErr = sqlite3_mprintf("unable to parse column %d", i);
2074+      sqlite3_free(zCreateSql);
2075+      return rc;
2076+    }
2077+
2078+    if( !(pTypes[i]&MASK_NULL) ){
2079+      zNotNull = " NOT NULL";
2080+    }
2081+
2082+    /* Add name and type to the create statement. */
2083+    zCreateSql = sqlite3_mprintf("%z%.*s %.*s%s%s",
2084+                                 zCreateSql,
2085+                                 zNameEnd - zNameStart, zNameStart,
2086+                                 zTypeEnd - zTypeStart, zTypeStart,
2087+                                 zNotNull, zSep);
2088+    if( !zCreateSql ){
2089+      return SQLITE_NOMEM;
2090+    }
2091+  }
2092+
2093+  *pzCreateSql = zCreateSql;
2094+  return SQLITE_OK;
2095+}
2096+
2097+/* Helper function for initializing the module. */
2098+/* argv[0] module name
2099+ * argv[1] db name for virtual table
2100+ * argv[2] virtual table name
2101+ * argv[3] backing table name
2102+ * argv[4] columns
2103+ */
2104+/* TODO(shess): Since connect isn't supported, could inline into
2105+ * recoverCreate().
2106+ */
2107+/* TODO(shess): Explore cases where it would make sense to set *pzErr. */
2108+static int recoverInit(
2109+  sqlite3 *db,                        /* Database connection */
2110+  void *pAux,                         /* unused */
2111+  int argc, const char *const*argv,   /* Parameters to CREATE TABLE statement */
2112+  sqlite3_vtab **ppVtab,              /* OUT: New virtual table */
2113+  char **pzErr                        /* OUT: Error message, if any */
2114+){
2115+  const unsigned kTypeCol = 4;  /* First argument with column type info. */
2116+  Recover *pRecover;            /* Virtual table structure being created. */
2117+  char *zDot;                   /* Any dot found in "db.table" backing. */
2118+  u32 iRootPage;                /* Root page of backing table. */
2119+  char *zCreateSql;             /* Schema of created virtual table. */
2120+  int rc;
2121+
2122+  /* Require to be in the temp database. */
2123+  if( ascii_strcasecmp(argv[1], "temp")!=0 ){
2124+    *pzErr = sqlite3_mprintf("recover table must be in temp database");
2125+    return SQLITE_MISUSE;
2126+  }
2127+
2128+  /* Need the backing table and at least one column. */
2129+  if( argc<=kTypeCol ){
2130+    *pzErr = sqlite3_mprintf("no columns specified");
2131+    return SQLITE_MISUSE;
2132+  }
2133+
2134+  pRecover = sqlite3_malloc(sizeof(Recover));
2135+  if( !pRecover ){
2136+    return SQLITE_NOMEM;
2137+  }
2138+  memset(pRecover, 0, sizeof(*pRecover));
2139+  pRecover->base.pModule = &recoverModule;
2140+  pRecover->db = db;
2141+
2142+  /* Parse out db.table, assuming main if no dot. */
2143+  zDot = strchr(argv[3], '.');
2144+  if( !zDot ){
2145+    pRecover->zDb = sqlite3_strdup(db->aDb[0].zName);
2146+    pRecover->zTable = sqlite3_strdup(argv[3]);
2147+  }else if( zDot>argv[3] && zDot[1]!='\0' ){
2148+    pRecover->zDb = sqlite3_strndup(argv[3], zDot - argv[3]);
2149+    pRecover->zTable = sqlite3_strdup(zDot + 1);
2150+  }else{
2151+    /* ".table" or "db." not allowed. */
2152+    *pzErr = sqlite3_mprintf("ill-formed table specifier");
2153+    recoverRelease(pRecover);
2154+    return SQLITE_ERROR;
2155+  }
2156+
2157+  pRecover->nCols = argc - kTypeCol;
2158+  pRecover->pTypes = sqlite3_malloc(pRecover->nCols);
2159+  if( !pRecover->zDb || !pRecover->zTable || !pRecover->pTypes ){
2160+    recoverRelease(pRecover);
2161+    return SQLITE_NOMEM;
2162+  }
2163+
2164+  /* Require the backing table to exist. */
2165+  /* TODO(shess): Be more pedantic about the form of the descriptor
2166+   * string.  This already fails for poorly-formed strings, simply
2167+   * because there won't be a root page, but it would make more sense
2168+   * to be explicit.
2169+   */
2170+  rc = getRootPage(pRecover->db, pRecover->zDb, pRecover->zTable, &iRootPage);
2171+  if( rc!=SQLITE_OK ){
2172+    *pzErr = sqlite3_mprintf("unable to find backing table");
2173+    recoverRelease(pRecover);
2174+    return rc;
2175+  }
2176+
2177+  /* Parse the column definitions. */
2178+  rc = ParseColumnsAndGenerateCreate(pRecover->nCols, argv + kTypeCol,
2179+                                     &zCreateSql, pRecover->pTypes, pzErr);
2180+  if( rc!=SQLITE_OK ){
2181+    recoverRelease(pRecover);
2182+    return rc;
2183+  }
2184+
2185+  rc = sqlite3_declare_vtab(db, zCreateSql);
2186+  sqlite3_free(zCreateSql);
2187+  if( rc!=SQLITE_OK ){
2188+    recoverRelease(pRecover);
2189+    return rc;
2190+  }
2191+
2192+  *ppVtab = (sqlite3_vtab *)pRecover;
2193+  return SQLITE_OK;
2194+}
2195