#!/usr/bin/python -u
#
# imports the API description and fills up a database with
# name relevance to modules, functions or web pages
#
# Operation needed:
# =================
#
# install mysqld, the python wrappers for mysql and libxml2, start mysqld
# Change the root passwd of mysql:
#    mysqladmin -u root password new_password
# Create the new database xmlsoft
#    mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
# change veillard and abcde with the right user name and passwd
#    mysql -p
#    password:
#    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
#           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
#
# As the user check the access:
#    mysql -p xmlsoft
#    Enter password:
#    Welcome to the MySQL monitor....
#    mysql> use xmlsoft
#    Database changed
#    mysql> quit
#    Bye
#
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost the user expected to be Apache's one
#
# On the Apache configuration, make sure you have php support enabled
#

import MySQLdb
import libxml2
import sys
import string
import os

#
# We are not interested in parsing errors here
#
def callback(ctx, str):
    return
libxml2.registerErrorHandler(callback, None)

#
# The dictionary of tables required and the SQL command needed
# to create them
#
TABLES={
  "symbols" : """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
  "words" : """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
  "wordsArchive" : """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
  "pages" : """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
  "AllQueries" : """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}

#
# The XML API description file to parse
#
API="libxml2-api.xml"
DB=None

#########################################################################
#									#
#                  MySQL database interfaces				#
#									#
#########################################################################
def createTable(db, name):
    """(Re)create the table `name` from its definition in TABLES.

    Any existing table of that name is dropped first.  Returns the value
    of the CREATE statement's execute() call, or -1 when `db` or `name`
    is None, when `name` is not a key of TABLES, or when creation fails.
    """
    if db is None:
        return -1
    if name is None:
        return -1
    c = db.cursor()

    # table names cannot be bound as query parameters; `name` is expected
    # to be one of the keys of the TABLES constant
    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        print("Failed to create table %s" % (name))
        return -1
    return ret

def checkTables(db, verbose = 1):
    """Make sure every table of TABLES exists and can be queried.

    Missing tables are created, tables that cannot be counted are
    repaired, then SELECT access is granted to nobody@localhost.
    Returns 0, or -1 when `db` is None.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    if verbose:
        print("Found %d tables" % (nbtables))
    tables = {}
    for row in c.fetchall():
        tables[row[0]] = {}

    for table in TABLES.keys():
        if table not in tables:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            if verbose:
                print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    if verbose:
        print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # best effort only: indexing still works without the grants, but
        # report the problem instead of hiding it
        print("Failed to grant access to nobody@localhost")
        print("%s %s" % sys.exc_info()[:2])
    return 0

def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    """Connect to the database and store the connection in the global DB.

    When `passwd` is None the password is read from the MySQL_PASS
    environment variable; the script exits with status 1 if it is unset.
    Returns the result of checkTables(), or -1 if no connection object
    was obtained.
    """
    global DB

    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except KeyError:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    ret = checkTables(DB, verbose)
    return ret

def _getDB():
    """Return the shared connection, opening it on first use."""
    if DB is None:
        openMySQL()
    return DB

def _insertOrUpdate(insert, insertArgs, update, updateArgs, label):
    """Run `insert`; if it fails run `update` instead.

    Both commands use %s placeholders and their values are passed
    separately to cursor.execute(), so the database driver does the
    quoting.  Returns the execute() result of the command that
    succeeded, or -1 after printing a diagnostic starting with `label`
    when both fail.
    """
    c = DB.cursor()
    try:
        return c.execute(insert, insertArgs)
    except Exception:
        # most likely the row exists already (unique key): refresh it
        try:
            return c.execute(update, updateArgs)
        except Exception:
            print("%s failed command" % (label))
            print("%s <- %r" % (update, updateArgs))
            print("%s %s" % sys.exc_info()[:2])
            return -1

def updateWord(name, symbol, relevance):
    """Record the relevance of word `name` for the API symbol `symbol`.

    Returns the execute() result, or -1 on missing argument, missing
    connection or database failure.
    """
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if symbol is None:
        return -1

    return _insertOrUpdate(
        "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
        (name, symbol, relevance),
        "UPDATE words SET relevance = %s where name = %s and symbol = %s",
        (relevance, name, symbol),
        "Update word (%s, %s, %s)" % (name, symbol, relevance))

def updateSymbol(name, module, type, desc):
    """Record the symbol `name` of kind `type` exported by `module`.

    The symbol name itself is indexed as a word of relevance 50.  Only
    the first sentence of `desc` (up to the first '.', at most 99
    characters, apostrophes replaced by spaces) is stored.
    Returns the execute() result, or -1 on failure.
    """
    updateWord(name, name, 50)
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if module is None:
        return -1
    if type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except Exception:
        # desc is None or not a string: store an empty description
        desc = ""

    return _insertOrUpdate(
        "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
        (name, module, type, desc),
        "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
        (module, type, desc, name),
        "Update symbol (%s, %s, %s)" % (name, module, type))

def addFunction(name, module, desc = ""):
    """Record `name` as a symbol of type 'function'."""
    return updateSymbol(name, module, 'function', desc)

def addMacro(name, module, desc = ""):
    """Record `name` as a symbol of type 'macro'."""
    return updateSymbol(name, module, 'macro', desc)

def addEnum(name, module, desc = ""):
    """Record `name` as a symbol of type 'enum'."""
    return updateSymbol(name, module, 'enum', desc)

def addStruct(name, module, desc = ""):
    """Record `name` as a symbol of type 'struct'."""
    return updateSymbol(name, module, 'struct', desc)

def addConst(name, module, desc = ""):
    """Record `name` as a symbol of type 'const'."""
    return updateSymbol(name, module, 'const', desc)

def addType(name, module, desc = ""):
    """Record `name` as a symbol of type 'type'."""
    return updateSymbol(name, module, 'type', desc)

def addFunctype(name, module, desc = ""):
    """Record `name` as a symbol of type 'functype'."""
    return updateSymbol(name, module, 'functype', desc)

def addPage(resource, title):
    """Record the web page `resource` with its `title`.

    Returns the execute() result, or -1 on failure.
    """
    if _getDB() is None:
        return -1
    if resource is None:
        return -1

    return _insertOrUpdate(
        "INSERT INTO pages (resource, title) VALUES (%s, %s)",
        (resource, title),
        "UPDATE pages SET title=%s WHERE resource=%s",
        (title, resource),
        "Update page (%s, %s)" % (resource, title))

def updateWordHTML(name, resource, desc, id, relevance):
    """Record the relevance of word `name` for the web page `resource`.

    `desc` is the section title (apostrophes replaced by spaces, cut to
    99 characters) and `id` the anchor of that section; both default to
    the empty string when None.  Returns the execute() result, or -1 on
    failure.
    """
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except Exception:
            desc = ""

    return _insertOrUpdate(
        "INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
        (name, resource, desc, id, relevance),
        "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
        (desc, id, relevance, name, resource),
        "Update symbol (%s, %s, %s)" % (name, resource, relevance))

def checkXMLMsgArchive(url):
    """Return the archives.ID of the message at `url`, or -1 if unknown."""
    if _getDB() is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1

    return row[0]

def addXMLMsgArchive(url, title):
    """Register the archived message `url` and return its new ID.

    `title` has apostrophes replaced by spaces and is cut to 99
    characters.  Returns -1 on failure.
    """
    if _getDB() is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        # read back the auto_increment ID assigned to the new row
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
        if row is None:
            print("addXMLMsgArchive failed to get the ID: %s" % (url))
            return -1
    except Exception:
        print("addXMLMsgArchive failed command: %s <- %s" % (cmd, url))
        return -1

    return int(row[0])

def updateWordArchive(name, id, relevance):
    """Record the relevance of word `name` for the archived message `id`.

    Returns the execute() result, or -1 on failure.
    """
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if id is None:
        return -1

    return _insertOrUpdate(
        "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
        (name, id, relevance),
        "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
        (relevance, name, id),
        "Update word archive (%s, %s, %s)" % (name, id, relevance))

#########################################################################
#									#
#                  Word dictionary and analysis routines		#
#									#
#########################################################################

#
# top 100 english word without the one len < 3 + own set
#
dropWords = {
    'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
    'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
    'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
    'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
    'have':0, 'more':0, 'these':0, 'did':0, 'two':0, 'many':0,
    'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
    'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
    'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
    'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
    'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0,
    'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
    'down':0,
    'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
}

# word -> {(module, symbol): relevance}, or None once a word was dropped
wordsDict = {}
# word -> {resource: (relevance, id, section)}
wordsDictHTML = {}
# word -> {archive message ID: relevance}
wordsDictArchive = {}

# every character replaced by a blank before a text is split into words;
# "\xc2" and "\xa0" are the two bytes of an UTF-8 encoded non-breaking space
_WORD_SEPARATORS = ".!?,'\";(){}<>=/*:#\\\n\r\xc2\xa0"

def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks replaced by blanks."""
    for sep in _WORD_SEPARATORS:
        str = str.replace(sep, " ")
    return str

def cleanupDescrString(str):
    """Return `str` without apostrophes or line breaks, with every run
    of whitespace collapsed to a single blank."""
    for sep in "'\n\r\xc2\xa0":
        str = str.replace(sep, " ")
    # join the words, not the characters of the string
    return " ".join(str.split())

def splitIdentifier(str):
    """Split an identifier into its lowercased component words.

    A word is one letter, followed by any uppercase letters, followed by
    any lowercase letters; digits right after a word are dropped and any
    other character is a separator.  "xmlParseFile" gives
    ['xml', 'parse', 'file'].
    """
    ret = []
    while str != "":
        cur = str[0].lower()
        str = str[1:]
        if (cur < 'a') or (cur > 'z'):
            continue
        while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
            cur = cur + str[0].lower()
            str = str[1:]
        while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
            cur = cur + str[0]
            str = str[1:]
        while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
            str = str[1:]
        ret.append(cur)
    return ret

def addWord(word, module, symbol, relevance):
    """Add `relevance` to the score of `word` for (`module`, `symbol`).

    Returns the accumulated relevance, 0 when the word is ignored and
    -1 on invalid arguments (missing value or word shorter than 3).
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            # the word shows up for too many symbols to be useful: drop it
            # and mark it so later occurrences are ignored
            wordsDict[word] = None
            return 0
        relevance = relevance + d.get((module, symbol), 0)
    else:
        wordsDict[word] = {}
    wordsDict[word][(module, symbol)] = relevance
    return relevance

def addString(str, module, symbol, relevance):
    """Index every word of `str` for (`module`, `symbol`).

    Returns the sum of the addWord() results, or -1 when `str` is None
    or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # honour the caller's relevance instead of a hard-coded 5
            ret = ret + addWord(word, module, symbol, relevance)

    return ret

def addWordHTML(word, resource, id, section, relevance):
    """Add `relevance` to the score of `word` for the page `resource`.

    The section title and anchor `id` recorded for the first occurrence
    of the word in a page are kept for later ones.  Returns the
    accumulated relevance, 0 when the word is ignored, -1 on invalid
    arguments.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance

def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for the page `resource`.

    Returns the sum of the successful addWordHTML() results, or -1 when
    `str` is None or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordHTML(word, resource, id, section, relevance)
                if r < 0:
                    print("addWordHTML failed: %s %s" % (word, resource))
                else:
                    ret = ret + r
            except Exception:
                print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
                print("%s %s" % sys.exc_info()[:2])

    return ret

def addWordArchive(word, id, relevance):
    """Add `relevance` to the score of `word` for archived message `id`.

    Returns the accumulated relevance, 0 when the word is ignored, -1 on
    invalid arguments (including an `id` of -1).
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance

def addStringArchive(str, id, relevance):
    """Index every word of `str` for the archived message `id`.

    Returns the sum of the successful addWordArchive() results, or -1
    when `str` is None or shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordArchive(word, id, relevance)
                if r < 0:
                    print("addWordArchive failed: %s %s" % (word, id))
                else:
                    ret = ret + r
            except Exception:
                print("addWordArchive failed: %s %s %d" % (word, id, relevance))
                print("%s %s" % sys.exc_info()[:2])
    return ret

#########################################################################
#									#
#                  XML API description analysis				#
#									#
#########################################################################

def _childElements(node):
    """Iterate over the children of `node`, skipping text nodes."""
    cur = node.children
    while cur is not None:
        if cur.type != 'text':
            yield cur
        cur = cur.next

def _indexNamedSymbol(top, register):
    """Register the symbol described by the element `top`.

    Calls register(symbol, file) and indexes the words of the symbol
    name with relevance 10.  Returns (file, symbol), or None when the
    element lacks a "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return None
    symbol = top.prop("name")
    if symbol is None:
        return None

    register(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return (file, symbol)

def loadAPI(filename):
    """Parse the XML API description `filename` and return the document."""
    doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return doc

def foundExport(file, symbol):
    """Register `symbol` as a function exported by `file`; return 1 if done."""
    if file is None:
        return 0
    if symbol is None:
        return 0
    addFunction(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1

def analyzeAPIFile(top):
    """Index the <exports> children of a <file> element; return their count."""
    count = 0
    name = top.prop("name")
    for cur in _childElements(top):
        if cur.name == "exports":
            count = count + foundExport(name, cur.prop("symbol"))
        else:
            print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
    return count

def analyzeAPIFiles(top):
    """Index every <file> child of the <files> element; return the count."""
    count = 0
    for cur in _childElements(top):
        if cur.name == "file":
            count = count + analyzeAPIFile(cur)
        else:
            print("unexpected element %s in API doc <files>" % (cur.name))
    return count

def analyzeAPIEnum(top):
    """Index an <enum> element; return 1 if indexed, 0 otherwise."""
    if _indexNamedSymbol(top, addEnum) is None:
        return 0
    return 1

def analyzeAPIConst(top):
    """Index a <const> element; return 1 if indexed, 0 otherwise."""
    if _indexNamedSymbol(top, addConst) is None:
        return 0
    return 1

def analyzeAPIType(top):
    """Index a <typedef> element; return 1 if indexed, 0 otherwise."""
    if _indexNamedSymbol(top, addType) is None:
        return 0
    return 1

def analyzeAPIFunctype(top):
    """Index a <functype> element; return 1 if indexed, 0 otherwise."""
    if _indexNamedSymbol(top, addFunctype) is None:
        return 0
    return 1

def analyzeAPIStruct(top):
    """Index a <struct> element and the words of its "info" attribute.

    Returns 1 if indexed, 0 otherwise.
    """
    found = _indexNamedSymbol(top, addStruct)
    if found is None:
        return 0
    (file, symbol) = found

    info = top.prop("info")
    if info is not None:
        info = info.replace("'", " ").strip()
        for word in info.split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1

def analyzeAPIMacro(top):
    """Index a <macro> element and the words of its <info> child.

    Returns 1 if indexed with a description, 0 otherwise.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    for cur in _childElements(top):
        if cur.name == "info":
            info = cur.content
            break

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1

def analyzeAPIFunction(top):
    """Index a <function> element.

    The words of the <info>, <return> and <arg> descriptions are indexed
    as well as the argument names.  Returns 1 if indexed, 0 otherwise.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    symbol = symbol.replace("'", " ").strip()
    info = None
    for cur in _childElements(top):
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                rinfo = rinfo.replace("'", " ").strip()
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo is not None:
                ainfo = ainfo.replace("'", " ").strip()
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                name = name.replace("'", " ").strip()
                addWord(name, file, symbol, 7)
    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1

def analyzeAPISymbols(top):
    """Index every child of the <symbols> element; return the count."""
    handlers = {
        "macro": analyzeAPIMacro,
        "function": analyzeAPIFunction,
        "const": analyzeAPIConst,
        "typedef": analyzeAPIType,
        "struct": analyzeAPIStruct,
        "enum": analyzeAPIEnum,
        "functype": analyzeAPIFunctype,
    }
    count = 0
    for cur in _childElements(top):
        handler = handlers.get(cur.name)
        if handler is None:
            print("unexpected element %s in API doc <symbols>" % (cur.name))
        else:
            count = count + handler(cur)
    return count

def analyzeAPI(doc):
    """Index the API description `doc`.

    Returns the number of symbols indexed, or -1 when `doc` is None or
    its root element is not <api>.
    """
    count = 0
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    for cur in _childElements(root):
        if cur.name == "files":
            # the <files> section is not indexed: analyzeAPIFiles() exists
            # for it but its call is disabled here
            pass
        elif cur.name == "symbols":
            count = count + analyzeAPISymbols(cur)
        else:
            print("unexpected element %s in API doc" % (cur.name))
    return count

#########################################################################
#									#
#                  Web pages parsing and analysis			#
#									#
#########################################################################

import glob

def _indexHTMLNode(resource, p, section, id):
    """Index the text content of node `p` with relevance 5.

    Returns the addStringHTML() result, or -1 if anything fails.
    """
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1

def analyzeHTMLText(doc, resource, p, section, id):
    """Index the text node `p` found in `section` of page `resource`."""
    return _indexHTMLNode(resource, p, section, id)

def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the paragraph `p` found in `section` of page `resource`."""
    return _indexHTMLNode(resource, p, section, id)

def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the <pre> block `p` found in `section` of page `resource`."""
    return _indexHTMLNode(resource, p, section, id)

def analyzeHTML(doc, resource):
    """Register the page `resource` and index the text of `doc`.

    Every text node is associated with the closest preceding h1/h2/h3
    heading (or the page title before the first heading).  Returns the
    number of nodes indexed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            res = ctxt.xpathEval("//head/title")
            title = res[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
            section = title
            id = ""
            for item in items:
                if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # XPath contexts of the libxml2 bindings are not garbage collected
        ctxt.xpathFreeContext()

    return para

def analyzeHTMLPages():
    """Index the *.html and tutorial/*.html pages of the current directory.

    Pages whose name starts with "API" and xml.html are skipped.
    Returns the number of pages indexed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xml.html":
            continue
        try:
            doc = libxml2.parseFile(html)
        except Exception:
            # not well-formed XML, fall back to the HTML parser
            doc = libxml2.htmlParseFile(html, None)
        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            # documents of the libxml2 bindings must be freed explicitly
            if doc is not None:
                doc.freeDoc()
    return ret

#########################################################################
#									#
#                  Mail archives parsing and analysis			#
#									#
#########################################################################

import time

def getXMLDateArchive(t = None):
    """Return the URL of the by-date index of the xml list archive for
    the month containing the time `t` (seconds since the epoch, read as
    UTC; defaults to now)."""
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    month = time.strftime("%B", T)
    year = T[0]
    url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
    return url

def scanXMLMsgArchive(url, title, force = 0):
    """Index the archived message at `url`.

    Messages already registered are skipped unless `force` is set.
    Returns 1 if the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            texts = ctxt.xpathEval("//pre//text()")
            for text in texts:
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        # one document is parsed per message: free it explicitly, the
        # libxml2 bindings do not release it on their own
        doc.freeDoc()

    return 1

def scanXMLDateArchive(t = None, force = 0):
    """Index the messages linked from the monthly index for time `t`.

    The global wordsDictArchive is reset first.  Returns the number of
    messages indexed, or -1 if the index page could not be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            anchors = ctxt.xpathEval("//a[@href]")
            for anchor in anchors:
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # one bad message must not stop the month, but say so
                    print("Failed to index message %s" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg


#########################################################################
#									#
#          Main code: open the DB, the API XML and analyze it		#
#									#
#########################################################################
def analyzeArchives(t = None, force = 0):
    """Index one month of mail archives and store the result in the DB."""
    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            continue
        for id in refs.keys():
            updateWordArchive(word, id, refs[id])
            i = i + 1

    print("Found %d associations in archive pages" % (i))

def analyzeHTMLTop():
    """Index the HTML pages and store the result in the DB."""
    ret = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret))

    i = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is None:
            continue
        for resource in refs.keys():
            (relevance, id, section) = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            i = i + 1

    print("Found %d associations in HTML pages" % (i))

def analyzeAPITop():
    """Index the XML API description and store the result in the DB.

    Exits with status 1 if the description cannot be parsed.
    """
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))

def usage():
    """Print the command line help and exit with status 1."""
    print("Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]")
    sys.exit(1)

def _indexMonthArchive(month, force):
    """Index the archives of `month`, given as "<year>-<full month name>"."""
    try:
        T = time.strptime(month, "%Y-%B")
        # mktime() reads T as local time while getXMLDateArchive() uses
        # gmtime(); presumably the ten extra days keep the result well
        # inside the requested month whatever the timezone
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Open the database then run the actions given on the command line,
    in order; --force only affects the actions that follow it."""
    try:
        openMySQL()
    except Exception:
        print("Failed to open the database")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    force = 0
    if not args:
        usage()

    months = ["January" , "February", "March", "April", "May",
              "June", "July", "August", "September", "October",
              "November", "December"]
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year':
            i = i + 1
            if i >= len(args):
                # the option needs a value
                usage()
            year = args[i]
            for month in months:
                _indexMonthArchive("%s-%s" % (year, month), force)
        elif arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # the option needs a value
                usage()
            _indexMonthArchive(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()