• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #!/usr/bin/python -u
2 #
3 # imports the API description and fills up a database with
4 # name relevance to modules, functions or web pages
5 #
6 # Operation needed:
7 # =================
8 #
9 # install mysqld, the python wrappers for mysql and libxml2, start mysqld
10 # Change the root passwd of mysql:
11 #    mysqladmin -u root password new_password
12 # Create the new database xmlsoft
13 #    mysqladmin -p create xmlsoft
14 # Create a database user 'veillard' and give him password access
15 # change veillard and abcde with the right user name and passwd
16 #    mysql -p
17 #    password:
18 #    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19 #           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20 #
21 # As the user check the access:
22 #    mysql -p xmlsoft
23 #    Enter password:
24 #    Welcome to the MySQL monitor....
25 #    mysql> use xmlsoft
26 #    Database changed
27 #    mysql> quit
28 #    Bye
29 #
30 # Then run the script in the doc subdir, it will create the symbols and
31 # word tables and populate them with information extracted from
32 # the libxml2-api.xml API description, and make them accessible read-only
33 # by nobody@localhost, the user Apache is expected to run as
34 #
35 # On the Apache configuration, make sure you have php support enabled
36 #
37 
38 import MySQLdb
39 import libxml2
40 import sys
41 import string
42 import os
43 
44 #
45 # We are not interested in parsing errors here
46 #
def callback(ctx, str):
    """Error handler that ignores whatever message it is handed."""
    pass
# Register callback (which discards its arguments) as libxml2's error handler.
libxml2.registerErrorHandler(callback, None)
50 
51 #
52 # The dictionary of tables required and the SQL command needed
53 # to create them
54 #
# Maps each required table name to the CREATE TABLE statement that
# createTable() executes for it.
TABLES={
  # one row per API symbol, written by updateSymbol()
  "symbols" : """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
	   module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
	   descr varchar(255),
	   UNIQUE KEY name (name),
	   KEY module (module))""",
  # word -> symbol relevance, written by updateWord()
  "words" : """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
	   symbol varchar(255) BINARY NOT NULL,
           relevance int,
	   KEY name (name),
	   KEY symbol (symbol),
	   UNIQUE KEY ID (name, symbol))""",
  # word -> web page relevance, written by updateWordHTML()
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
	   resource varchar(255) BINARY NOT NULL,
	   section varchar(255),
	   id varchar(50),
           relevance int,
	   KEY name (name),
	   KEY resource (resource),
	   UNIQUE KEY ref (name, resource))""",
  # word -> archived message relevance, written by updateWordArchive()
  "wordsArchive" : """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
	   ID int(11) NOT NULL,
           relevance int,
	   KEY name (name),
	   UNIQUE KEY ref (name, ID))""",
  # page resource -> title, written by addPage()
  "pages" : """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY name (resource))""",
  # archived message URL and title, written by addXMLMsgArchive()
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY id (ID,resource(255)),
	   INDEX (ID),
	   INDEX (resource))""",
  # NOTE(review): nothing visible in this file writes to Queries or
  # AllQueries; checkTables() grants nobody@localhost write access to
  # Queries, so presumably the web front end fills them - confirm.
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
  "AllQueries" : """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
}
109 
110 #
111 # The XML API description file to parse
112 #
# File name of the XML API description.
API = "libxml2-api.xml"
# Module-wide MySQL connection; None until openMySQL() assigns it.
DB = None
115 
116 #########################################################################
117 #									#
118 #                  MySQL database interfaces				#
119 #									#
120 #########################################################################
def createTable(db, name):
    """Drop table `name` if it exists and recreate it from TABLES.

    Returns -1 when `db` or `name` is None, when `name` is not a key of
    TABLES, or when the CREATE statement raises; otherwise returns the
    value the cursor's execute() gave for the CREATE statement.
    """
    global TABLES

    if db is None or name is None:
        return -1
    if name not in TABLES:
        # Refuse before dropping: a table without a known definition
        # could not be recreated afterwards.
        print("Failed to create table %s" % (name))
        return -1

    c = db.cursor()
    # Table names cannot be passed as query parameters; `name` has just
    # been checked against the fixed TABLES keys.
    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        print("Failed to create table %s" % (name))
        return -1
    return ret
140 
def checkTables(db, verbose = 1):
    """Make sure every table listed in TABLES exists and can be counted.

    Missing tables are created with createTable(). A table whose row
    count query raises is passed to "repair table" and counted again.
    Finally the access rights for nobody@localhost are (re)granted.

    Returns -1 when `db` is None, 0 otherwise.
    """
    global TABLES

    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    tables = {}
    for _ in range(nbtables):
        row = c.fetchone()
        tables[row[0]] = {}

    for table in TABLES.keys():
        if table not in tables:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # Give nobody@localhost SELECT on the whole database and write access
    # to Queries only. Still best-effort, but a failure is now reported
    # instead of being silently dropped.
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        print("Failed to grant access to nobody@localhost")
        print("%s %s" % sys.exc_info()[:2])
    return 0
184 
def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to MySQL, store the connection in DB and check the tables.

    When `passwd` is None the password is read from the MySQL_PASS
    environment variable; if that is unset the process exits with
    status 1.

    Returns -1 if the connection object is None, otherwise the result of
    checkTables().
    """
    global DB

    if passwd is None:
        passwd = os.environ.get("MySQL_PASS")
        if passwd is None:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
200 
def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in the words table.

    An INSERT is tried first; if it raises, an UPDATE of the existing
    (name, symbol) row is tried instead. The connection is opened on
    demand through openMySQL().

    Returns -1 on missing arguments, missing connection or when both
    statements fail; otherwise the value returned by execute().
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        # Parameters are passed separately so the driver does the quoting.
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except Exception:
        update = "UPDATE words SET relevance = %s where name = %s and symbol = %s"
        args = (relevance, name, symbol)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
230 
def updateSymbol(name, module, type, desc):
    """Record a symbol in the symbols table and index its own name.

    The symbol name is first registered as a word for itself with
    relevance 50. `desc` is reduced to its text before the first "."
    (quotes replaced by spaces, at most 99 characters); anything that
    cannot be processed that way becomes "". An INSERT is tried first,
    then an UPDATE of the row with the same name.

    Returns -1 on missing arguments, missing connection or when both
    statements fail; otherwise the value returned by execute().
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ")
        desc = desc.split(".")[0]
        desc = desc[0:99]
    except Exception:
        desc = ""

    c = DB.cursor()
    try:
        # Parameters are passed separately so the driver does the quoting.
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except Exception:
        update = "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s"
        args = (module, type, desc, name)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
271 
def addFunction(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'function'."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
274 
def addMacro(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'macro'."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
277 
def addEnum(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'enum'."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
280 
def addStruct(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'struct'."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
283 
def addConst(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'const'."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
286 
def addType(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'type'."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
289 
def addFunctype(name, module, desc = ""):
    """Record `name` from `module` as a symbol of type 'functype'."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
292 
def addPage(resource, title):
    """Store the title of web page `resource` in the pages table.

    An INSERT is tried first; if it raises, the title of the existing
    row is updated instead. The connection is opened on demand through
    openMySQL().

    Returns -1 on missing resource, missing connection or when both
    statements fail; otherwise the value returned by execute().
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        # Parameters are passed separately so that quotes in a page title
        # cannot break the statement.
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except Exception:
        update = "UPDATE pages SET title=%s WHERE resource=%s"
        args = (title, resource)
        try:
            ret = c.execute(update, args)
        except Exception:
            # The previous message formatted names that do not exist in
            # this function, turning the report itself into a NameError.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
320 
def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for page `resource` in wordsHTML.

    `desc` is stored in the section column (quotes replaced by spaces,
    at most 99 characters; None or unprocessable values become "") and
    `id` in the id column (None becomes ""). An INSERT is tried first,
    then an UPDATE of the existing (name, resource) row.

    Returns -1 on missing arguments, missing connection or when both
    statements fail; otherwise the value returned by execute().
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    c = DB.cursor()
    try:
        # Parameters are passed separately so the driver does the quoting.
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) "
            "VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except Exception:
        update = ("UPDATE wordsHTML SET section=%s, id=%s, relevance=%s "
                  "where name=%s and resource=%s")
        args = (desc, id, relevance, name, resource)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update word HTML (%s, %s, %s) failed command" % (name, resource, relevance))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
360 
def checkXMLMsgArchive(url):
    """Look up the archives ID recorded for message `url`.

    Returns the ID column of the matching row, or -1 when there is no
    connection, `url` is None, the query raises or no row matches.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        # The URL is passed as a parameter so the driver does the quoting.
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]
382 
def addXMLMsgArchive(url, title):
    """Insert message `url` into the archives table and return its ID.

    `title` is stored with quotes replaced by spaces and cut to 99
    characters (None becomes ""). The ID is read back with a SELECT on
    the resource column.

    Returns -1 when there is no connection, `url` is None, a statement
    raises or the row cannot be read back; otherwise the ID as an int.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    args = (url, title)
    try:
        # Parameters are passed separately so the driver does the quoting.
        c.execute(cmd, args)
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        args = (url,)
        c.execute(cmd, args)
        row = c.fetchone()
    except Exception:
        print("addXMLMsgArchive failed command: %s with arguments %r" % (cmd, args))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])
413 
def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for archive entry `id`.

    An INSERT into wordsArchive is tried first; if it raises, the
    existing (name, ID) row is updated instead.

    Returns -1 on missing arguments, missing connection or when both
    statements fail; otherwise the value returned by execute().
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        # Parameters are passed separately so the driver does the quoting.
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except Exception:
        update = "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s"
        args = (relevance, name, id)
        try:
            ret = c.execute(update, args)
        except Exception:
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("%s with arguments %r" % (update, args))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
443 
444 #########################################################################
445 #									#
446 #                  Word dictionary and analysis routines		#
447 #									#
448 #########################################################################
449 
450 #
451 # The 100 most common English words, minus those shorter than 3 letters,
451 # plus a few additions of our own
452 #
# Words that addWord(), addWordHTML() and addWordArchive() refuse to index.
# Only the keys matter; every value is 0.
dropWords = dict.fromkeys("""
    the this can man had him only
    and not been other even are was
    new most but when some made from
    who could after that will time also
    have more these did two many
    they may before for which out then
    must one through with you said
    first back were what any years his
    her where all its now much she
    about such your there into like
    would than our well their them over
    down
    net www bad Okay bin cur
""".split(), 0)

# word -> {(module, symbol): relevance}, filled by addWord()
wordsDict = dict()
# word -> {resource: (relevance, id, section)}, filled by addWordHTML()
wordsDictHTML = dict()
# word -> {archive id: relevance}, filled by addWordArchive()
wordsDictArchive = dict()
472 
def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks replaced by spaces.

    Each separator character is replaced by exactly one space, so the
    length of the string does not change; callers split the result on
    whitespace afterwards.
    """
    separators = (".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}",
                  "<", ">", "=", "/", "*", ":", "#", "\\", "\n", "\r",
                  "\xc2", "\xa0")
    for sep in separators:
        str = str.replace(sep, " ")
    return str
498 
def cleanupDescrString(str):
    """Return `str` with quotes and line breaks removed and spacing collapsed.

    Single quotes, newlines, carriage returns and the two non-breaking
    space bytes are replaced by spaces, then runs of whitespace are
    collapsed to a single space.
    """
    for sep in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(sep, " ")
    # Join the split words. The previous code joined the string itself,
    # which put a space between every single character.
    return " ".join(str.split())
508 
def splitIdentifier(str):
    """Split an identifier into its lower-cased word components.

    A component starts at a letter and continues over a run of upper
    case letters followed by a run of lower case letters; digits that
    directly follow are dropped. Any other character is skipped.
    For example "xmlParseFile" gives ["xml", "parse", "file"].
    """
    ret = []
    pos = 0
    end = len(str)
    # Walk the string by index instead of re-slicing it at every step.
    while pos < end:
        cur = str[pos].lower()
        pos = pos + 1
        if cur < 'a' or cur > 'z':
            continue
        while pos < end and 'A' <= str[pos] <= 'Z':
            cur = cur + str[pos].lower()
            pos = pos + 1
        while pos < end and 'a' <= str[pos] <= 'z':
            cur = cur + str[pos]
            pos = pos + 1
        while pos < end and '0' <= str[pos] <= '9':
            pos = pos + 1
        ret.append(cur)
    return ret
526 
def addWord(word, module, symbol, relevance):
    """Accumulate in wordsDict the relevance of `word` for (module, symbol).

    Returns -1 for a missing or too short (< 3 characters) word or a
    missing module/symbol, 0 when the word is ignored (listed in
    dropWords, first character above 0x80, or already disabled), and
    otherwise the accumulated relevance stored for the pair.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            # A word attached to more than 500 symbols is disabled for good.
            wordsDict[word] = None
            return 0
        if key in d:
            relevance = relevance + d[key]
    else:
        wordsDict[word] = {}
    wordsDict[word][key] = relevance
    return relevance
554 
def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) through addWord().

    The text is cleaned with cleanupWordsString() and split on
    whitespace; words of fewer than 3 characters are skipped.

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the addWord() results.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # Use the caller's relevance; it used to be ignored in favour
            # of a hard-coded 5.
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
566 
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate in wordsDictHTML the relevance of `word` for a page.

    The entry kept per (word, resource) is (relevance, id, section).
    When an entry already exists its id and section are kept (unless
    they are None) and the relevance values are added up.

    Returns -1 for a missing or too short (< 3 characters) word or a
    missing resource/section, 0 when the word is ignored, and otherwise
    the accumulated relevance.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = {}
        wordsDictHTML[word] = d
    d[resource] = (relevance, id, section)
    return relevance
600 
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for page `resource` through addWordHTML().

    The text is cleaned with cleanupWordsString() and split on
    whitespace; words of fewer than 3 characters are skipped. A failing
    word is reported and does not stop the others.

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the non-negative addWordHTML() results.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            else:
                # Error codes are no longer added into the total, the
                # same way addStringArchive() treats them.
                ret = ret + r
        except Exception:
            print("addWordHTML failed: %s %s %s" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return ret
619 
def addWordArchive(word, id, relevance):
    """Accumulate in wordsDictArchive the relevance of `word` for `id`.

    Returns -1 for a missing or too short (< 3 characters) word or an id
    that is None or -1, 0 when the word is ignored, and otherwise the
    accumulated relevance stored for (word, id).
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if id in d:
            relevance = relevance + d[id]
    else:
        d = {}
        wordsDictArchive[word] = d
    d[id] = relevance
    return relevance
647 
def addStringArchive(str, id, relevance):
    """Index every word of `str` for archive entry `id`.

    The text is cleaned with cleanupWordsString() and split on
    whitespace; words of fewer than 3 characters are skipped. A failing
    word is reported and does not stop the others.

    Returns -1 when `str` is None or shorter than 3 characters, otherwise
    the sum of the non-negative addWordArchive() results.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                ret = ret + r
        except Exception:
            print("addWordArchive failed: %s %s %s" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return ret
667 
668 #########################################################################
669 #									#
670 #                  XML API description analysis				#
671 #									#
672 #########################################################################
673 
def loadAPI(filename):
    """Parse `filename` with libxml2.parseFile() and return the document."""
    doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return doc
678 
def foundExport(file, symbol):
    """Record `symbol` as a function exported by `file` and index its name.

    Returns 0 when either argument is None, 1 otherwise.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
689 
def analyzeAPIFile(top):
    """Process the children of one <file> element of the API description.

    Every <exports> child is handed to foundExport() together with the
    file's name attribute; text nodes are skipped and any other element
    is reported.

    Returns the number of exports that were recorded.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # Both placeholders now get a value; the message used to
                # raise TypeError because only one argument was supplied.
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
704 
def analyzeAPIFiles(top):
    """Process the <file> children of the <files> element.

    Text nodes are skipped and any other element is reported.

    Returns the sum of the analyzeAPIFile() results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type != 'text':
            if cur.name == "file":
                count = count + analyzeAPIFile(cur)
            else:
                print("unexpected element %s in API doc <files>" % (cur.name))
        cur = cur.next
    return count
719 
def analyzeAPIEnum(top):
    """Record the enum described by element `top` and index its name.

    Returns 0 when the file or name attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addEnum(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    return 1
734 
def analyzeAPIConst(top):
    """Record the constant described by element `top` and index its name.

    Returns 0 when the file or name attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addConst(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    return 1
749 
def analyzeAPIType(top):
    """Record the type described by element `top` and index its name.

    Returns 0 when the file or name attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addType(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)
    return 1
763 
def analyzeAPIFunctype(top):
    """Record the function type described by `top` and index its name.

    Returns 0 when the file or name attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addFunctype(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)
    return 1
777 
def analyzeAPIStruct(top):
    """Record the struct described by element `top` and index its words.

    The components of the name are indexed with relevance 10 and the
    words (3 characters or more) of the info attribute, when present,
    with relevance 5.

    Returns 0 when the file or name attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, module)
    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    info = top.prop("info")
    if info is not None:
        for word in info.replace("'", " ").split():
            if len(word) > 2:
                addWord(word, module, symbol, 5)
    return 1
800 
def analyzeAPIMacro(top):
    """Record the macro described by element `top` and index its words.

    The components of the name are indexed with relevance 10. The
    content of the first <info> child is used as description and its
    words (3 characters or more) are indexed with relevance 5.

    Returns 0 when the file or name attribute is missing or when there
    is no <info> child, 1 otherwise.
    NOTE(review): a macro without <info> is still recorded through
    addMacro() although 0 is returned - confirm that is intended.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text' and cur.name == "info":
            info = cur.content
            break
        cur = cur.next

    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
839 
def analyzeAPIFunction(top):
    """Record the function described by element `top` and index its words.

    Indexed text: the info attribute of <return> (relevance 7), the info
    attribute (5) and name (7) of every <arg>, the content of <info>
    (5) and the components of the function name (10). The <info>
    content, when present, also becomes the symbol description.

    Returns 0 when the file or name attribute is missing, 1 otherwise.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    symbol = symbol.replace("'", " ").strip()
    info = None
    cur = top.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                addString(rinfo.replace("'", " ").strip(), module, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo is not None:
                addString(ainfo.replace("'", " ").strip(), module, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                addWord(name.replace("'", " ").strip(), module, symbol, 7)
        cur = cur.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, module, symbol, 10)

    return 1
890 
def analyzeAPISymbols(top):
    """Dispatch every child of the <symbols> element to its analyzer.

    Text nodes are skipped and elements of an unknown kind are reported.

    Returns the sum of the analyzer results.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "macro":
            count = count + analyzeAPIMacro(cur)
        elif cur.name == "function":
            count = count + analyzeAPIFunction(cur)
        elif cur.name == "const":
            count = count + analyzeAPIConst(cur)
        elif cur.name == "typedef":
            count = count + analyzeAPIType(cur)
        elif cur.name == "struct":
            count = count + analyzeAPIStruct(cur)
        elif cur.name == "enum":
            count = count + analyzeAPIEnum(cur)
        elif cur.name == "functype":
            count = count + analyzeAPIFunctype(cur)
        else:
            # The message used to name <files>, the wrong section.
            print("unexpected element %s in API doc <symbols>" % (cur.name))
        cur = cur.next
    return count
917 
def analyzeAPI(doc):
    """Analyze a parsed API description document.

    Only the <symbols> child of the <api> root is analyzed; the <files>
    child is accepted but deliberately not processed (analyzeAPIFiles()
    is not called). Other elements are reported.

    Returns -1 when `doc` is None or the root element is not <api>,
    otherwise the number of symbols counted by analyzeAPISymbols().
    """
    count = 0
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    cur = root.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "files":
                pass
            elif cur.name == "symbols":
                count = count + analyzeAPISymbols(cur)
            else:
                print("unexpected element %s in API doc" % (cur.name))
        cur = cur.next
    return count
940 
941 #########################################################################
942 #									#
943 #                  Web pages parsing and analysis			#
944 #									#
945 #########################################################################
946 
947 import glob
948 
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node `p` for page `resource`.

    Returns the addStringHTML() result, or -1 if reading the content or
    indexing it raises.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
957 
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node `p` for page `resource`.

    Returns the addStringHTML() result, or -1 if reading the content or
    indexing it raises.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
966 
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of preformatted node `p` for page `resource`.

    Returns the addStringHTML() result, or -1 if reading the content or
    indexing it raises.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
975 
# NOTE(review): this five-argument analyzeHTML is immediately replaced by the
# two-argument analyzeHTML(doc, resource) defined right after it, so at run
# time the name never refers to this version.
def analyzeHTML(doc, resource, p, section, id):
    # Index the content of node p for the page; -1 if anything raises.
    words = 0
    try:
	content = p.content
	words = words + addStringHTML(content, resource, id, section, 5)
    except:
        return -1
    return words
984 
def analyzeHTML(doc, resource):
    """Index the text of one parsed web page.

    The page title (or "Page <resource>" when none is found) is stored
    with addPage(). Text nodes are indexed under the most recent h1/h2/h3
    heading seen in the XPath result, together with that heading's id
    (or name) attribute.

    Returns the number of text items handed to the indexer.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        title = ctxt.xpathEval("//head/title")[0].content
    except Exception:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
        section = title
        anchor = ""
        for item in items:
            if item.name in ('h1', 'h2', 'h3'):
                section = item.content
                if item.prop("id"):
                    anchor = item.prop("id")
                elif item.prop("name"):
                    anchor = item.prop("name")
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, anchor)
                para = para + 1
            # NOTE(review): the XPath expression above selects only
            # headings and text nodes, so the 'p' and 'pre' branches look
            # unreachable; kept as they were.
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, anchor)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, anchor)
                para = para + 1
            else:
                print("Page %s, unexpected %s element" % (resource, item.name))
    except Exception:
        print("Page %s: problem analyzing" % (resource))
        print("%s %s" % sys.exc_info()[:2])

    return para
1021 
def analyzeHTMLPages():
    """Index the HTML pages of the current and the tutorial directory.

    Files whose name starts with "API" and the file "xml.html" are
    skipped. Each page is parsed as XML first and with the HTML parser
    if that raises.

    Returns the number of pages that were analyzed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API" or html == "xml.html":
            continue
        try:
            try:
                doc = libxml2.parseFile(html)
            except Exception:
                # The fallback sits inside the outer handler so that one
                # unreadable page no longer aborts the whole scan.
                doc = libxml2.htmlParseFile(html, None)
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
    return ret
1041 
1042 #########################################################################
1043 #									#
1044 #                  Mail archives parsing and analysis			#
1045 #									#
1046 #########################################################################
1047 
1048 import time
1049 
def getXMLDateArchive(t = None):
    """Return the URL of the by-date mail archive index for time `t`.

    `t` is a number of seconds since the epoch (the current time when
    None); year and month are taken from its UTC value.
    """
    # The URL is built from English month names; taking them from a
    # fixed table keeps it independent of the process locale.
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    year = T[0]
    month = months[T[1] - 1]
    return "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
1058 
def scanXMLMsgArchive(url, title, force = 0):
    """Load one archived mail message and index its title and text.

    url   -- URL of the message page
    title -- subject of the message
    force -- when 0, a message for which checkXMLMsgArchive() returns
             an ID other than -1 is left alone; any other value makes
             it be loaded and indexed again

    The title is passed to addStringArchive() with the value 20 and
    every text node found below a <pre> element with the value 5
    (presumably relevance weights -- confirm against addStringArchive).

    Returns 1 when the message was loaded and indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    # The document and the XPath context are not released by the
    # Python wrappers; free them explicitly so that scanning a whole
    # month of messages does not accumulate parsed pages in memory.
    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
1088 
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages listed on one monthly archive index page.

    t     -- time selecting the month, passed to getXMLDateArchive()
    force -- passed unchanged to scanXMLMsgArchive()

    The global wordsDictArchive is reset to an empty dictionary before
    the scan. Every anchor whose href starts with "msg" is resolved
    against the index URL and handed to scanXMLMsgArchive(), after a
    leading "Re: " and then a leading "[xml] " have been removed from
    its title.

    Returns the sum of the scanXMLMsgArchive() results, or -1 when the
    index page could not be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    # The document and the XPath context have to be released
    # explicitly, the Python wrappers do not do it.
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Best effort: one broken message must not stop the
                    # scan of the month, but the failure is reported
                    # instead of being dropped silently.
                    print("Failed to index message %s:" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1126 
1127 
1128 #########################################################################
1129 #									#
1130 #          Main code: open the DB, the API XML and analyze it		#
1131 #									#
1132 #########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the word associations.

    t     -- time selecting the month, passed to scanXMLDateArchive()
    force -- passed unchanged to scanXMLDateArchive()

    After the scan, every (word, id, relevance) entry found in the
    global wordsDictArchive is passed to updateWordArchive(). Words
    whose entry is None are counted as skipped.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    if ret < 0:
        # The index page could not be loaded; scanXMLDateArchive()
        # already reported it and left wordsDictArchive empty, so
        # there is nothing to store and -1 is not a page count.
        return
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    skipped = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for ident in refs.keys():
            relevance = refs[ident]
            updateWordArchive(word, ident, relevance)
            i = i + 1

    print("Found %d associations in archive pages, skipped %d words" % (i, skipped))
1152 
def analyzeHTMLTop():
    """Index the HTML pages and store the resulting word associations.

    Runs analyzeHTMLPages(), then walks the global wordsDictHTML and
    calls updateWordHTML() once for every (word, resource) pair whose
    entry is not None. Prints the number of words and pages indexed
    and the number of associations stored.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    count = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is not None:
            for resource in refs.keys():
                # each entry is a (relevance, id, section) triple
                relevance, ident, section = refs[resource]
                updateWordHTML(word, resource, section, ident, relevance)
                count += 1

    print("Found %d associations in HTML pages" % (count))
1172 
def analyzeAPITop():
    """Load and analyze the API description, then store the word index.

    The document returned by loadAPI(API) is given to analyzeAPI() and
    freed afterwards; any failure in that step is reported and ends
    the program with exit status 1. Then updateWord() is called for
    every (module, symbol) key found under each word of the global
    wordsDict; words whose entry is None are counted as skipped.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        blocks = analyzeAPI(doc)
        print("Analyzed %d blocks" % (blocks))
        doc.freeDoc()
    except:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % (sys.exc_type, sys.exc_value))
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))

    associations = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            skipped += 1
        else:
            for key in refs.keys():
                module, symbol = key
                updateWord(word, symbol, refs[key])
                associations += 1

    print("Found %d associations, skipped %d words" % (associations, skipped))
1200 
def usage():
    """Print the command-line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1204 
def _indexMonthArchive(spec, force):
    """Index the mail archive of the month `spec`, given as "YYYY-MonthName"."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # Ten days are added to the start of the month; presumably so
        # that the local-time result of mktime() still falls inside the
        # requested month once it is broken down again as UTC.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Open the database and process the command-line options in order.

    Options:
      --force                 set the force flag for the archive options
                              that FOLLOW it on the command line
      --archive               index the mail archive of the current month
      --archive-year year     index the twelve months of `year`
      --archive-month month   index one month given as "YYYY-MonthName"
      --API                   analyze the API description
      --docs                  analyze the HTML pages

    An unknown option, an option missing its value, or an empty
    command line prints the usage message and exits with status 1,
    as does a failure to open the database.
    """
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()
        return

    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    force = 0
    i = 0
    while i < len(args):
        option = args[i]
        if option == '--force':
            force = 1
        elif option == '--archive':
            analyzeArchives(None, force)
        elif option == '--archive-year' or option == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option value is missing: report it as a usage
                # error instead of failing with an IndexError
                usage()
                return
            if option == '--archive-year':
                for month in months:
                    _indexMonthArchive("%s-%s" % (args[i], month), force)
            else:
                _indexMonthArchive(args[i], force)
        elif option == '--API':
            analyzeAPITop()
        elif option == '--docs':
            analyzeHTMLTop()
        else:
            usage()
            return
        i = i + 1
1256 
# Run the indexer only when this file is executed as a script.
if __name__ == '__main__':
    main()
1259