• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# 2009 November 04
2#
3# The author disclaims copyright to this source code.  In place of
4# a legal notice, here is a blessing:
5#
6#    May you do good and not evil.
7#    May you find forgiveness for yourself and forgive others.
8#    May you share freely, never taking more than you give.
9#
10#***********************************************************************
11#
12# This file contains common code used the fts3 tests. At one point
13# equivalent functionality was implemented in C code. But it is easier
14# to use Tcl.
15#
16
17#-------------------------------------------------------------------------
18# USAGE: fts3_integrity_check TBL
19#
20# This proc is used to verify that the full-text index is consistent with
21# the contents of the fts3 table. In other words, it checks that the
22# data in the %_contents table matches that in the %_segdir and %_segments
23# tables.
24#
25# This is not an efficient procedure. It uses a lot of memory and a lot
26# of CPU. But it is better than not checking at all.
27#
28# The procedure is:
29#
30#   1) Read the entire full-text index from the %_segdir and %_segments
31#      tables into memory. For each entry in the index, the following is
32#      done:
33#
34#          set C($iDocid,$iCol,$iPosition) $zTerm
35#
36#   2) Iterate through each column of each row of the %_content table.
37#      Tokenize all documents, and check that for each token there is
38#      a corresponding entry in the $C array. After checking a token,
39#      [unset] the $C array entry.
40#
41#   3) Check that array $C is now empty.
42#
43#
44proc fts3_integrity_check {tbl} {
45
46  fts3_read2 $tbl 1 A
47
48  foreach zTerm [array names A] {
49    foreach doclist $A($zTerm) {
50      set docid 0
51      while {[string length $doclist]>0} {
52        set iCol 0
53        set iPos 0
54        set lPos [list]
55        set lCol [list]
56
57        # First varint of a doclist-entry is the docid. Delta-compressed
58        # with respect to the docid of the previous entry.
59        #
60        incr docid [gobble_varint doclist]
61        if {[info exists D($zTerm,$docid)]} {
62          while {[set iDelta [gobble_varint doclist]] != 0} {}
63          continue
64        }
65        set D($zTerm,$docid) 1
66
67        # Gobble varints until the 0x00 that terminates the doclist-entry
68        # is found.
69        while {[set iDelta [gobble_varint doclist]] > 0} {
70          if {$iDelta == 1} {
71            set iCol [gobble_varint doclist]
72            set iPos 0
73          } else {
74            incr iPos $iDelta
75            incr iPos -2
76            set C($docid,$iCol,$iPos) $zTerm
77          }
78        }
79      }
80    }
81  }
82
83  foreach key [array names C] {
84    #puts "$key -> $C($key)"
85  }
86
87
88  db eval "SELECT * FROM ${tbl}_content" E {
89    set iCol 0
90    set iDoc $E(docid)
91    foreach col [lrange $E(*) 1 end] {
92      set c $E($col)
93      set sql {SELECT fts3_tokenizer_test('simple', $c)}
94
95      foreach {pos term dummy} [db one $sql] {
96        if {![info exists C($iDoc,$iCol,$pos)]} {
97          set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
98          lappend errors $es
99        } else {
100          if {$C($iDoc,$iCol,$pos) != "$term"} {
101            set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
102            append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
103            lappend errors $es
104          }
105          unset C($iDoc,$iCol,$pos)
106        }
107      }
108      incr iCol
109    }
110  }
111
112  foreach c [array names C] {
113    lappend errors "Bad index entry: $c -> $C($c)"
114  }
115
116  if {[info exists errors]} { return [join $errors "\n"] }
117  return "ok"
118}
119
120# USAGE: fts3_terms TBL WHERE
121#
122# Argument TBL must be the name of an FTS3 table. Argument WHERE is an
123# SQL expression that will be used as the WHERE clause when scanning
124# the %_segdir table. As in the following query:
125#
126#   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
127#
128# This function returns a list of all terms present in the segments
129# selected by the statement above.
130#
131proc fts3_terms {tbl where} {
132  fts3_read $tbl $where a
133  return [lsort [array names a]]
134}
135
136
137# USAGE: fts3_doclist TBL TERM WHERE
138#
139# Argument TBL must be the name of an FTS3 table. TERM is a term that may
140# or may not be present in the table. Argument WHERE is used to select a
141# subset of the b-tree segments in the associated full-text index as
142# described above for [fts3_terms].
143#
144# This function returns the results of merging the doclists associated
145# with TERM in the selected segments. Each doclist is an element of the
146# returned list. Each doclist is formatted as follows:
147#
148#   [$docid ?$col[$off1 $off2...]?...]
149#
150# The formatting is odd for a Tcl command in order to be compatible with
151# the original C-language implementation. If argument WHERE is "1", then
152# any empty doclists are omitted from the returned list.
153#
154proc fts3_doclist {tbl term where} {
155  fts3_read $tbl $where a
156
157
158  foreach doclist $a($term) {
159    set docid 0
160
161    while {[string length $doclist]>0} {
162      set iCol 0
163      set iPos 0
164      set lPos [list]
165      set lCol [list]
166      incr docid [gobble_varint doclist]
167
168      while {[set iDelta [gobble_varint doclist]] > 0} {
169        if {$iDelta == 1} {
170          lappend lCol [list $iCol $lPos]
171          set iPos 0
172          set lPos [list]
173          set iCol [gobble_varint doclist]
174        } else {
175          incr iPos $iDelta
176          incr iPos -2
177          lappend lPos $iPos
178        }
179      }
180
181      if {[llength $lPos]>0} {
182        lappend lCol [list $iCol $lPos]
183      }
184
185      if {$where != "1" || [llength $lCol]>0} {
186        set ret($docid) $lCol
187      } else {
188        unset -nocomplain ret($docid)
189      }
190    }
191  }
192
193  set lDoc [list]
194  foreach docid [lsort -integer [array names ret]] {
195    set lCol [list]
196    set cols ""
197    foreach col $ret($docid) {
198      foreach {iCol lPos} $col {}
199      append cols " $iCol\[[join $lPos { }]\]"
200    }
201    lappend lDoc "\[${docid}${cols}\]"
202  }
203
204  join $lDoc " "
205}
206
207###########################################################################
208
209proc gobble_varint {varname} {
210  upvar $varname blob
211  set n [read_fts3varint $blob ret]
212  set blob [string range $blob $n end]
213  return $ret
214}
215proc gobble_string {varname nLength} {
216  upvar $varname blob
217  set ret [string range $blob 0 [expr $nLength-1]]
218  set blob [string range $blob $nLength end]
219  return $ret
220}
221
222# The argument is a blob of data representing an FTS3 segment leaf.
223# Return a list consisting of alternating terms (strings) and doclists
224# (blobs of data).
225#
226proc fts3_readleaf {blob} {
227  set zPrev ""
228  set terms [list]
229
230  while {[string length $blob] > 0} {
231    set nPrefix [gobble_varint blob]
232    set nSuffix [gobble_varint blob]
233
234    set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
235    append zTerm [gobble_string blob $nSuffix]
236    set doclist [gobble_string blob [gobble_varint blob]]
237
238    lappend terms $zTerm $doclist
239    set zPrev $zTerm
240  }
241
242  return $terms
243}
244
245proc fts3_read2 {tbl where varname} {
246  upvar $varname a
247  array unset a
248  db eval " SELECT start_block, leaves_end_block, root
249            FROM ${tbl}_segdir WHERE $where
250            ORDER BY level ASC, idx DESC
251  " {
252    if {$start_block == 0} {
253      foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
254    } else {
255      db eval " SELECT block
256                FROM ${tbl}_segments
257                WHERE blockid>=$start_block AND blockid<=$leaves_end_block
258                ORDER BY blockid
259      " {
260        foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
261
262      }
263    }
264  }
265}
266
267proc fts3_read {tbl where varname} {
268  upvar $varname a
269  array unset a
270  db eval " SELECT start_block, leaves_end_block, root
271            FROM ${tbl}_segdir WHERE $where
272            ORDER BY level DESC, idx ASC
273  " {
274    if {$start_block == 0} {
275      foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
276    } else {
277      db eval " SELECT block
278                FROM ${tbl}_segments
279                WHERE blockid>=$start_block AND blockid<$leaves_end_block
280                ORDER BY blockid
281      " {
282        foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
283
284      }
285    }
286  }
287}
288
289##########################################################################
290
291