1 /*---------------------------------------------------------------------------*
2 * grxmldoc.cpp *
3 * *
4 * Copyright 2007, 2008 Nuance Communciations, Inc. *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20 #include <assert.h>
21 #include <stdlib.h>
22 #include <fstream>
23 #include <sstream>
24 #include <iostream>
25 #include <algorithm> // for std::sort
26 #include "tinyxml.h"
27 #include "grph.h" // The word graph object and interface
28 #include "sub_grph.h" // The sub-graph object and interface
29 #include "hashmap.h"
30 #include "grxmldoc.h"
31 #include "ESR_Session.h"
32 //#include "LCHAR.h"
33
// Set to 1 to compile in the tracing emitted through DEBUG_PRINT and the
// "#if GRXML_DEBUG" sections of this file.
#define GRXML_DEBUG 0
// NOTE(review): not referenced in the visible part of this file.
#define MAX_PATH_NAME 512

// Prints message x to stdout, then terminates the process with exit status y.
// The expansion is a bare brace block; call sites in this file use it both
// with and without a trailing ';'.
#define FATAL_ERROR(x,y) { std::cout << (x) << std::endl; exit ((y)); }
// Prints message x to stdout. The expansion already ends in ';'.
#define WARNING(x) std::cout << (x) << std::endl;

#if GRXML_DEBUG
//#define DEBUG_PRINT(x) //
#define DEBUG_PRINT(x) std::cout << (x) << std::endl;
#define PRINT_EXPRESSION(x)
//#define PRINT_EXPRESSION(x) std::cout << (x) << std::endl;
#else
// Tracing disabled: both macros expand to nothing (the trailing "//" is a
// comment and is removed before macro definition).
#define DEBUG_PRINT(x) //
#define PRINT_EXPRESSION(x) //

#endif

using namespace std;

// Prints an error naming t when string s is empty. Execution continues
// afterwards; the macro does not abort.
#define CHECK_NOT_EMPTY(s, t) { if (s.empty()) \
    { \
        std::cout << "ERROR: Empty string of type " << t <<std::endl; \
    } \
}
58
/**
 * Converts a field consisting solely of decimal digits.
 *
 * @param text   candidate field
 * @param value  receives the converted number on success
 * @return true when text is non-empty and contains only the characters 0-9.
 *
 * Values too large for an int are not detected (conversion is done by atoi).
 */
static bool parse_count(const std::string& text, int* value)
{
    if (text.empty() || text.find_first_not_of("0123456789") != std::string::npos) {
        return false;
    }
    *value = atoi(text.c_str());
    return true;
}

/**
 * Parses a repeat/count specification.
 *
 * Accepted forms:
 *   "n"    exactly n          -> *minCnt = *maxCnt = n
 *   "n-m"  between n and m    -> *minCnt = n, *maxCnt = m
 *   "n-"   n or more          -> *minCnt = n, *maxCnt = -1
 *   "n+"   n or more          -> *minCnt = n, *maxCnt = -1
 *
 * Every numeric field must consist entirely of digits; previously only the
 * first character of each field was checked, so inputs such as "1x-2",
 * "1-2x" or "1-2-3" were silently accepted.
 *
 * @param s       specification text
 * @param minCnt  receives the lower bound
 * @param maxCnt  receives the upper bound, or -1 when open-ended
 * @return 0 on success, 1 when s is malformed. On failure the outputs may
 *         have been partially written.
 */
int get_range(const std::string& s, int* minCnt, int* maxCnt)
{
    const std::string::size_type dash = s.find('-');
    if (dash != std::string::npos) {
        if (!parse_count(s.substr(0, dash), minCnt)) {
            return 1;
        }
        *maxCnt = -1;   // open-ended unless an upper bound follows the dash
        const std::string upper = s.substr(dash + 1);
        if (!upper.empty() && !parse_count(upper, maxCnt)) {
            return 1;
        }
        return 0;
    }

    const std::string::size_type plus = s.find('+');
    if (plus != std::string::npos) {
        // '+' must terminate the specification.
        if (plus + 1 != s.size() || !parse_count(s.substr(0, plus), minCnt)) {
            return 1;
        }
        *maxCnt = -1;
        return 0;
    }

    int exact = 0;
    if (!parse_count(s, &exact)) {
        return 1;
    }
    *minCnt = *maxCnt = exact;
    return 0;
}
88
GRXMLDoc()89 GRXMLDoc::GRXMLDoc()
90 {
91 m_NodeKeyWords.insert(make_pair("grammar", NodeTypeGrammar));
92 m_NodeKeyWords.insert(make_pair("rule", NodeTypeRule));
93 m_NodeKeyWords.insert(make_pair("ruleref", NodeTypeRuleReference));
94 m_NodeKeyWords.insert(make_pair("one-of", NodeTypeOneOf));
95 m_NodeKeyWords.insert(make_pair("item", NodeTypeItem));
96 m_NodeKeyWords.insert(make_pair("tag", NodeTypeTag));
97 m_NodeKeyWords.insert(make_pair("count", NodeTypeCount));
98 m_NodeKeyWords.insert(make_pair("meta", NodeTypeMeta));
99 m_pGraph = 0;
100 m_RuleAutoIndex = 0;
101 m_TagAutoIndex = 0;
102 m_LabelAutoIndex = 0;
103 m_ExpandedRulesAutoIndex = 0;
104 m_XMLFileName = "dummy.xml";
105 }
106
107
~GRXMLDoc()108 GRXMLDoc::~GRXMLDoc()
109 {
110 deleteRules();
111 if (m_pGraph) {
112 delete m_pGraph;
113 }
114 }
115
116
parseGrammar(XMLNode & node,std::string & xMLFileName)117 bool GRXMLDoc::parseGrammar( XMLNode &node, std::string & xMLFileName )
118 {
119 m_XMLFileName = xMLFileName;
120 // Set up the internally defined rules, etc.
121 initializeLists();
122 // The top level "document" node is given to this fn
123 // Create the container for the word graph.
124 if (m_pGraph) {
125 delete m_pGraph;
126 }
127 m_pGraph = new Graph("XML grammar");
128 SubGraph *p_SubGraph;
129
130 parseNode( node, p_SubGraph, 1 ); // NB Subgraph pointed to will change in recursive fn.
131
132 if (findSubGraph( m_RootRule, p_SubGraph )) {
133 m_pGraph->ExpandRules (p_SubGraph);
134 p_SubGraph->RemoveInternalConnections ();
135 //Print the root rule.
136 //printSubgraph( *p_SubGraph );
137 }
138 return true;
139 }
140
141
parseNode(XMLNode & node,SubGraph * & p_SubGraph,const unsigned int level)142 bool GRXMLDoc::parseNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
143 {
144 // We will create a new subgraph for each rule node.
145 // The "current" subgraph is substituted with the new subgraph for all ops on child nodes.
146 // After processing child nodes the original subgraph is reinstated
147 // for final operations in the endNode() fn.
148
149 // Initial processing of the current node before processing children
150 #if 0 && GRXML_DEBUG
151 if(node.Type() == TiXmlNode::ELEMENT)
152 node.ToElement()->Print( stdout, level);
153 else if(node.Type() == TiXmlNode::DOCUMENT)
154 node.ToDocument()->Print( stdout, level);
155 else if(node.Type() == TiXmlNode::TEXT)
156 node.ToText()->Print( stdout, level);
157 else if(node.Type() == TiXmlNode::DECLARATION)
158 node.ToDeclaration()->Print( stdout, level);
159 else {
160 const char* text = node.Value();
161 if(!text) text = "__NULL__";
162 printf("processing node type %d text %s\n", node.Type(), text);
163 }
164 #endif
165 beginNode( node, p_SubGraph, level );
166
167 SubGraph *p_LocalSubGraph;
168 p_LocalSubGraph = p_SubGraph;
169 TiXmlNode* child;
170 for( child = node.FirstChild(); child; child = child->NextSibling() )
171 {
172 parseNode ( *child, p_SubGraph, level+1 );
173 }
174 // Revert current node
175 p_SubGraph = p_LocalSubGraph;
176
177 // Finish processing current node
178 endNode( node, p_SubGraph, level );
179
180 return true;
181 } // parseNode
182
183
beginNode(XMLNode & node,SubGraph * & p_SubGraph,const unsigned int level)184 bool GRXMLDoc::beginNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
185 {
186 std::string name = node.Value();
187 DEBUG_PRINT("Element = " + name);
188
189 // XMLNode::Type type = node.getType();
190 if ( node.Type() == TiXmlNode::TEXT) // isCData()
191 {
192 const char* cc_name = node.Parent()->Value();
193 std::string str_name(cc_name);
194 DEBUG_PRINT (std::string("CDATA ") + name);
195 DEBUG_PRINT (std::string("CDATA ") + str_name);
196
197 processCDATA( node, p_SubGraph );
198 }
199 else if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() /*isLeaf()*/)
200 {
201 //printNode(node, level);
202 // Use enum value
203 KEYWDPAIR::iterator pos;
204 pos = m_NodeKeyWords.find( name );
205 KeywordValues nodeType = NodeTypeBadValue;
206 if ( pos != m_NodeKeyWords.end() )
207 {
208 nodeType = (*pos).second;
209 DEBUG_PRINT("nodeType=" + nodeType);
210 } else if(node.Type() == TiXmlNode::COMMENT) {
211 return true;
212 } else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) {
213 return true;
214 } else {
215 FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT);
216 }
217
218 switch ( nodeType )
219 {
220 case NodeTypeGrammar:
221 {
222 beginParseGrammarNode( node );
223 }
224 break;
225 case NodeTypeRule:
226 {
227 // NB This fn creates a new subgraph.
228 beginParseRuleNode( node, p_SubGraph );
229 }
230 break;
231 case NodeTypeRuleReference:
232 {
233 // NB This fn creates a new subgraph.
234 beginRuleRef( node, p_SubGraph );
235 }
236 break;
237 case NodeTypeOneOf:
238 {
239 beginOneOf( node, p_SubGraph );
240 }
241 break;
242 case NodeTypeItem:
243 {
244 beginItem( node, p_SubGraph );
245 }
246 break;
247 case NodeTypeTag:
248 {
249 beginTag( node, p_SubGraph );
250 }
251 break;
252 case NodeTypeCount:
253 {
254 beginCount( node, p_SubGraph );
255 }
256 break;
257 case NodeTypeMeta:
258 {
259 beginParseMetaNode( node );
260 }
261 break;
262 case NodeTypeBadValue:
263 default:
264 DEBUG_PRINT( "UNKNOWN node name: " + name );
265 break;
266 }; // switch
267 } //is a Node or Leaf
268 else if ( node.Type() == TiXmlNode::TEXT) // isCData()
269 {
270 DEBUG_PRINT (std::string("CDATA ") + name);
271 processCDATA( node, p_SubGraph );
272 }
273 return true;
274 } // beginNode()
275
276
endNode(XMLNode & node,SubGraph * & p_SubGraph,const unsigned int level)277 bool GRXMLDoc::endNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
278 {
279 std::string name = node.Value();
280 //XMLNode::Type type = node.getType();
281
282 if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() )
283 {
284 KEYWDPAIR::iterator pos;
285 pos = m_NodeKeyWords.find( name );
286 KeywordValues nodeType = NodeTypeBadValue;
287 if ( pos != m_NodeKeyWords.end() )
288 {
289 nodeType = (*pos).second;
290 } else if(node.Type() == TiXmlNode::COMMENT) {
291 return true;
292 } else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) {
293 return true;
294 } else if(node.Type() == TiXmlNode::TEXT) {
295
296 } else {
297 FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT );
298 }
299
300 switch ( nodeType )
301 {
302 case NodeTypeGrammar:
303 {
304 endParseGrammarNode( node );
305 }
306 break;
307 case NodeTypeRule:
308 {
309 endParseRuleNode( node, p_SubGraph );
310 }
311 break;
312 case NodeTypeRuleReference:
313 {
314 endRuleRef( node, p_SubGraph );
315 }
316 break;
317 case NodeTypeOneOf:
318 {
319 endOneOf( node, p_SubGraph );
320 }
321 break;
322 case NodeTypeItem:
323 {
324 endItem(node, p_SubGraph );
325 }
326 break;
327 case NodeTypeTag:
328 {
329 endTag( node, p_SubGraph );
330 }
331 break;
332 case NodeTypeCount:
333 {
334 endCount( node, p_SubGraph );
335 }
336 break;
337 case NodeTypeMeta:
338 {
339 endParseMetaNode( node );
340 }
341 break;
342 case NodeTypeBadValue:
343 default:
344 DEBUG_PRINT( "UNKNOWN node name: ");
345 DEBUG_PRINT( name.c_str() );
346 //Extend the
347 break;
348 }; // switch
349 } //isNode() or isLeaf()
350 else
351 {
352 // Do nothing?
353 }
354 return true;
355 } // endNode()
356
357
beginParseGrammarNode(XMLNode & node)358 bool GRXMLDoc::beginParseGrammarNode(XMLNode &node)
359 {
360 const char* attr;
361 #define GETATTR(nAmE) ((attr=node.ToElement()->Attribute(nAmE))!=NULL) ? attr:""
362 m_XMLMode = GETATTR("mode");
363 m_XMLLanguage = GETATTR("xml:lang");
364 m_RootRule = GETATTR("root"); // The root rule name
365
366 DEBUG_PRINT("Root rule = " + m_RootRule);
367
368 m_XMLTagFormat = GETATTR("tag-format");
369 m_XMLVersion = GETATTR("version");
370 m_XMLBase = GETATTR("xml:base");
371 return true;
372 }
373
beginParseMetaNode(XMLNode & node)374 bool GRXMLDoc::beginParseMetaNode(XMLNode &node)
375 {
376 const char* attr;
377 std::string meta_name = GETATTR("name");
378 std::string meta_value = GETATTR("content");
379
380 if(meta_name == "word_penalty") {
381 m_MetaKeyValPairs.insert(meta_name,meta_value);
382 // m_MetaKeyValPairs.print();
383 } else if(meta_name == "do_skip_interword_silence") {
384 for(int j = 0; j<(int)meta_value.size(); j++){
385 meta_value[j] = tolower(meta_value[j]); //lower();
386 }
387 if(meta_value!="true" && meta_value!="false")
388 printf ("\nWarning: %s must be set to 'true' or 'false'; defaulting to 'false'\n", meta_name.c_str());
389 else
390 m_MetaKeyValPairs.insert(meta_name,meta_value);
391 } else if(meta_name == "userdict_name") {
392 printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str());
393 } else {
394 printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str());
395 }
396 return true;
397 }
398
399
endParseGrammarNode(XMLNode & node)400 bool GRXMLDoc::endParseGrammarNode(XMLNode &node)
401 {
402 // End parse operations
403 return true;
404 }
405
406
beginParseRuleNode(XMLNode & node,SubGraph * & p_SubGraph)407 bool GRXMLDoc::beginParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph)
408 {
409 const char* attr;
410 // Note: The subGraph may change if there are forward references. This
411 // is fine as we revert to the previous one when finished parsing the current node.
412 DEBUG_PRINT ( "---- Rule\n" );
413 std::string ruleName = GETATTR("id" );
414 std::string s_tag = GETATTR("tag" );
415 if( s_tag.length()>0) {
416 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
417 }
418 CHECK_NOT_EMPTY( ruleName, "id" );
419 // Rule name must be unique within scope of entire grammar.
420 // Put rule on stack - for context
421 m_RuleListStack.push( ruleName );
422
423 // Check whether a ruleref placeholder exists for this rule.
424 int index;
425 bool foundRule = findRuleIndex( ruleName, index );
426 if (foundRule) {
427 // Rule is already declared; it must have been forward referenced
428 // so swap the placeholder subgraph in.
429 // NB subgraph and rule name are already known to lists.
430 SubGraph *p_ExistingSubgraph;
431 if ( findSubGraph( ruleName, p_ExistingSubgraph ) ) {
432 p_SubGraph = p_ExistingSubgraph;
433 }
434 else {
435 FATAL_ERROR("ERROR! Subgraph without rule name entry found!", -1);
436 }
437 }
438 else {
439 // Create a Word Graph node for each rule node
440 SubGraph *newGraph;
441 addRuleToList( ruleName, newGraph );
442 p_SubGraph = newGraph;
443 }
444
445 // Make a note of the scope or rules; public, etc - used in map file.
446 findRuleIndex( ruleName, index );
447 std::string ruleScope = GETATTR("scope" );
448 if ( !ruleScope.empty() ) {
449 m_RuleScope.insert(index, ruleScope);
450 }
451
452 // We must accommodate Rules that have CDATA without an <item> element.
453 // We need to infer this element for all rules.
454 m_pGraph->BeginItem( p_SubGraph );
455
456 PRINT_EXPRESSION( ruleName + " = { " );
457 return true;
458 } // beginParseRuleNode()
459
460
endParseRuleNode(XMLNode & node,SubGraph * & p_SubGraph)461 bool GRXMLDoc::endParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph )
462 {
463 // The rule expression has been built as a subgraph and ID added to the rule list.
464 // Finished editing subgraph
465 DEBUG_PRINT ( "---- /Rule\n" );
466 //m_pGraph->EndRule(&p_SubGraph);
467 // Tell the world
468 //std::string ruleName = attr.get( "id" );
469 std::string ruleName = m_RuleListStack.top();
470 m_RuleListStack.pop();
471 //CHECK_NOT_EMPTY( ruleName, "id" );
472 // Must be unique rule name within scope of entire grammar.
473 // Check whether a ruleref placeholder exists for this rule.
474 m_pGraph->addSubGraph ( p_SubGraph );
475
476 // We must accommodate Rules that have CDATA without an <item> element.
477 // We need to infer this element for all rules.
478 m_pGraph->EndItem( p_SubGraph );
479
480 PRINT_EXPRESSION( " }\n" );
481 return true;
482 }
483
processCDATA(XMLNode & node,SubGraph * & p_SubGraph)484 bool GRXMLDoc::processCDATA( XMLNode &node, SubGraph *&p_SubGraph )
485 {
486 // Note the Item's CDATA
487 // Strip leading and trailing whitespace
488 const char* cc_name = node.Parent()->Value();
489 std::string str_name(cc_name); // = node.Parent()->ValueStr(); // getName
490 // std::string name = node.Parent()->Value(); // getName
491 //if ( name == "item" ) {
492 if ( str_name != "tag" ) {
493
494 const char* const whitespace = " \t\r\n\v\f";
495 std::string cdata = node.Value(); // getCData()
496 std::string word; // Words are whitespace separated
497
498 cdata.erase(0, cdata.find_first_not_of(whitespace) );
499 cdata.erase(cdata.find_last_not_of(whitespace) + 1);
500 #if GRXML_DEBUG
501 std::cout << "/--" << cdata << "--/\n";
502 #endif
503
504 std::string::size_type begIdx, endIdx;
505
506 //search beginning of the first word
507 begIdx = cdata.find_first_not_of(whitespace);
508
509 //while beginning of a word found
510 while (begIdx != std::string::npos) {
511 //search end of the actual word
512 endIdx = cdata.find_first_of (whitespace, begIdx);
513 if (endIdx == string::npos) {
514 //end of word is end of line
515 endIdx = cdata.length();
516 }
517 word.clear();
518 // word.assign(cdata,begIdx,endIdx);
519 word.append (cdata, begIdx, endIdx - begIdx);
520 if ( !word.empty() )
521 {
522 #if GRXML_DEBUG
523 std::cout << " -->" << word << "<--\n";
524 #endif
525 int index;
526 // If a slot then take note of rule name
527 if ( IsSlot( word ) ) {
528 const char* xmlBasename;
529 std::string ruleName = m_RuleListStack.top();
530 m_SlotList.insert(index, ruleName);
531 xmlBasename = strrchr(m_XMLFileName.c_str(),'/');
532 xmlBasename = xmlBasename ? xmlBasename+1 : m_XMLFileName.c_str();
533 word = (std::string)xmlBasename + "." + ruleName + "@" + word;
534 addLabelToList( word );
535 findLabelIndex( word, index );
536 } else {
537 addLabelToList( word );
538 findLabelIndex( word, index );
539 }
540 m_pGraph->AddLabel( p_SubGraph, index );
541 }
542 begIdx = cdata.find_first_not_of (whitespace, endIdx);
543
544 }
545 } //tag
546 else {
547 // Do nothing with CDATA for elements that are not items.
548 // In particular, do not strip whitespace from tag cdata.
549 // However, CPPDOM appears to remove linefeeds. May need to tidy up.
550
551 }
552 return true;
553 } // cdata
554
beginItem(XMLNode & node,SubGraph * & p_SubGraph)555 bool GRXMLDoc::beginItem( XMLNode &node, SubGraph *&p_SubGraph )
556 {
557 const char* attr;
558 DEBUG_PRINT ("---- Item:\n");
559 // First check whethere there is a count/repeat
560 std::string s = GETATTR("repeat" );
561 int minCnt=0,maxCnt=0;
562 std::string s_tag = GETATTR("tag" );
563 if( s_tag.length()>0) {
564 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
565 }
566 if( s.length()>0 && get_range( s, &minCnt, &maxCnt) ) {
567 FATAL_ERROR(std::string("error: while parsing range ") + s,1);
568 }
569 if ( !s.empty() ) {
570 // RED FLAG: max should not be 0! A +ve number should have been given.
571 if( maxCnt>0) {
572 m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
573 }
574 else {
575 // NB: BeginItemRepeat can only use min of 0 or 1!
576 m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
577 }
578 }
579 else {
580 m_pGraph->BeginItem( p_SubGraph );
581 }
582 return true;
583 }
584
585
endItem(XMLNode & node,SubGraph * & p_SubGraph)586 bool GRXMLDoc::endItem( XMLNode &node, SubGraph *&p_SubGraph )
587 {
588 DEBUG_PRINT ( "---- /Item\n" );
589
590 // What TODO if no tag for an item?
591
592 m_pGraph->EndItem( p_SubGraph );
593 return true;
594 }
595
596
beginRuleRef(XMLNode & node,SubGraph * & p_SubGraph)597 bool GRXMLDoc::beginRuleRef( XMLNode &node, SubGraph *&p_SubGraph )
598 {
599 // Extend word FST node with an entire FST subgraph.
600 // Forward referencing of rules is supported.
601 // NB Remove the leading # from the ruleref name!
602 DEBUG_PRINT ( "---- Ruleref\n" );
603
604 const char* attr;
605 std::string s_tag = GETATTR("tag" );
606 if( s_tag.length()>0) {
607 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
608 }
609 std::string s = GETATTR("uri" );
610 if (s.empty())
611 {
612 //
613 FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
614 }
615 // Remove the #:
616 int p1 = s.find("#");
617 if ( p1 !=0 ) {
618 FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'" + ". Rule reference must start with a '#'. External references are not supported.", -1 );
619 }
620 string ruleName;
621 getRuleRefName( node, ruleName );
622
623 //std::string parentRuleName = m_RuleListStack.top();
624 //addRuleDependency( parentRuleName, ruleName );
625
626 int index;
627 bool foundRule = findRuleIndex( ruleName, index );
628 if (!foundRule) {
629 // Forward reference; create a placeholder subgraph ptr.
630 //SubGraph *newGraph = new SubGraph( (char *) ruleName.c_str() );
631 // RED FLAG: Remember to check fwd ref rule was filled in at end.
632 SubGraph *newGraph;
633 addRuleToList( ruleName, newGraph );
634 findRuleIndex( ruleName, index );
635 }
636 // We can now treat a forward-referenced graph as if it was defined.
637 // We will add the subgraph when we have the tag - see endItem().
638 m_pGraph->BeginRule( p_SubGraph );
639 m_pGraph->AddRuleRef( p_SubGraph, index );
640 m_pGraph->EndRule( p_SubGraph );
641
642 return true;
643 }
644
645
endRuleRef(XMLNode & grmNode,SubGraph * & p_SubGraph)646 bool GRXMLDoc::endRuleRef(XMLNode &grmNode, SubGraph *&p_SubGraph )
647 {
648 DEBUG_PRINT ( "---- /Ruleref\n" );
649 // Does nothing
650 // NB The tag is not under the ruleref element - it is in the current item element.
651 // We now add the tag of the AddRuleRef as we see the tag element. See EndTag().
652
653 return true;
654 }
655
656
beginOneOf(XMLNode & grmNode,SubGraph * & p_SubGraph)657 bool GRXMLDoc::beginOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
658 {
659 DEBUG_PRINT ( "----OneOf\n" );
660 m_pGraph->BeginOneOf (p_SubGraph);
661 return true;
662 }
663
664
endOneOf(XMLNode & grmNode,SubGraph * & p_SubGraph)665 bool GRXMLDoc::endOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
666 {
667 DEBUG_PRINT ( "----/OneOf\n" );
668 m_pGraph->EndOneOf (p_SubGraph);
669 return true;
670 }
671
672
beginTag(XMLNode & node,SubGraph * & p_SubGraph)673 bool GRXMLDoc::beginTag( XMLNode &node, SubGraph *&p_SubGraph )
674 {
675 DEBUG_PRINT ("---- Tag\n");
676 std::string s = node.ToElement()->GetText(); // getCdata();
677 #if GRXML_DEBUG
678 std::cout << s; // debug
679 #endif
680 // Store the semantic tag info.
681 // NB Do not strip whitespace from tag cdata
682 if ( !s.empty() )
683 {
684 int index;
685 addTagToList( s );
686 findTagIndex( s, index );
687 m_pGraph->AddTag ( p_SubGraph, index );
688 }
689
690 return true;
691 }
692
693
endTag(XMLNode & node,SubGraph * & p_SubGraph)694 bool GRXMLDoc::endTag( XMLNode &node, SubGraph *&p_SubGraph )
695 {
696 DEBUG_PRINT ("---- /Tag\n");
697 return true;
698 }
699
700
beginCount(XMLNode & node,SubGraph * & p_SubGraph)701 bool GRXMLDoc::beginCount( XMLNode &node, SubGraph *&p_SubGraph )
702 {
703 const char* attr;
704 // Count of reps applies to the text elements in this count node
705 DEBUG_PRINT ("---- Count\n");
706 // Get number attr
707 std::string s = GETATTR("number");
708 std::string s_tag = GETATTR("tag" );
709 if( s_tag.length()>0) {
710 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
711 }
712 if (s.empty()) {
713 return false;
714 }
715 // not in subgraph but in graph?!
716 //graph.BeginCount(n);
717
718 int minCnt=-1, maxCnt=-1;
719 if( get_range( s, &minCnt, &maxCnt) ) {
720 FATAL_ERROR(std::string("error: while parsing range ") + s,1);
721 }
722 if ( s.c_str() == std::string("optional") )
723 {
724 m_pGraph->BeginOptional( p_SubGraph );
725 }
726 else if ( minCnt>0 && maxCnt>0)
727 {
728 m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
729 }
730 else if( minCnt>0 )
731 {
732 m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
733 }
734 else { //
735 m_pGraph->BeginOptional ( p_SubGraph );
736 }
737
738 return true;
739 }
740
741
endCount(XMLNode & node,SubGraph * & p_SubGraph)742 bool GRXMLDoc::endCount( XMLNode &node, SubGraph *&p_SubGraph )
743 {
744 DEBUG_PRINT ("---- /Count\n");
745 m_pGraph->EndCount( p_SubGraph );
746 return true;
747 }
748
endParseMetaNode(XMLNode & node)749 bool GRXMLDoc::endParseMetaNode(XMLNode &node)
750 {
751 // End parse operations
752 return true;
753 }
754
printNode(XMLNode & node,int level)755 void GRXMLDoc::printNode(XMLNode &node, int level)
756 {
757 std::string name = node.Value();
758 int type = node.Type();
759 std::string c_data;
760
761 for(int i=0;i<level;i++) std::cout << " ";
762
763 char c = ' ';
764 switch(type)
765 {
766 case TiXmlNode::ELEMENT:
767 // case XMLNode::xml_nt_node: // grammar, rule, one-of, item, count
768 c = '+';
769 break;
770 /* case TiXmlNode::TEXT:
771 // case XMLNode::xml_nt_leaf:
772 c = '-';
773 break; */
774 case TiXmlNode::DOCUMENT:
775 // case XMLNode::xml_nt_document:
776 c = '\\';
777 break;
778 case TiXmlNode::TEXT:
779 // case XMLNode::xml_nt_cdata:
780 c = '#';
781 c_data = node.Value(); // getCdata();
782 break;
783 case TiXmlNode::UNKNOWN:
784 case TiXmlNode::COMMENT:
785 case TiXmlNode::TYPECOUNT:
786 case TiXmlNode::DECLARATION:
787 default:
788 std::cout << "Error: not sure what to do here" << std::endl;
789 break;
790 }
791 if(node.Type() == TiXmlNode::TEXT) // isCData()
792 std::cout << c << name.c_str() << "[" << c_data << "]" << std::endl;
793 //Extend the tag hashtable
794 else
795 std::cout << c << name.c_str() << std::endl;
796
797 if( node.Type() == TiXmlNode::ELEMENT) {
798
799 for(TiXmlAttribute* attr=node.ToElement()->FirstAttribute();
800 attr; attr=attr->Next() ) {
801
802 // guru: added output of attributes
803 for (int i=0; i<level; i++)
804 std::cout << " ";
805 std::cout << " ";
806 std::cout << attr->Name() << ": " << attr->Value() << std::endl;
807 }
808 }
809
810 }
811
812 /** Function: addRuleToList
813 Extends list of SubGraphs with given subGraph
814 and extends list of rule names too.
815 TODO: Can we use one hash and use internal numeric index for rule IDs?
816 */
817
818
addRuleToList(std::string const & ruleName,SubGraph * & p_SubGraph)819 bool GRXMLDoc::addRuleToList(std::string const & ruleName, SubGraph *&p_SubGraph)
820 {
821 int index;
822 if ( findRuleIndex ( ruleName, index ) ) {
823 FATAL_ERROR("ERROR! Rule name " + ruleName + " is already defined!", -1 );
824 }
825
826 addLabelToList( m_XMLFileName + "@" + ruleName);
827 findLabelIndex( m_XMLFileName + "@" + ruleName, index );
828 #if GRXML_DEBUG
829 std::cout << "Rule " << ruleName << std::endl;
830 #endif
831 // Create the new subgraph and update lists
832 m_RuleList.insert( ruleName, index );
833 p_SubGraph = new SubGraph( (char *) ruleName.c_str(), index );
834
835 bool success = m_SubgraphList.insert( ruleName, p_SubGraph );
836 if (!success) {
837 FATAL_ERROR("ERROR! subgraph for " + ruleName + " is already defined!", -1 );
838 }
839 #if ADD_BRACES
840 addLabelToList( "{" );
841 std::stringstream ss;
842 ss << "}(" << index << ")";
843 addLabelToList( ss.str());
844 #endif
845 return success;
846 }
847
848
deleteRules()849 bool GRXMLDoc::deleteRules()
850 {
851 // Delete all allocated subgraphs.
852 // The rule strings are part of the hashtables and get deleted by them.
853 int index;
854 SubGraph *p_SubGraph;
855 std::string ruleName;
856 while ( !m_RuleList.isEmpty() ) {
857 m_RuleList.getFirst( &ruleName, &index );
858 m_RuleList.remove( ruleName );
859 if (m_SubgraphList.getValue( ruleName, &p_SubGraph ) ) {
860 delete p_SubGraph;
861 }
862 else {
863 FATAL_ERROR("No subgraph for rule " + ruleName + "! Mismatched rules and subgraph hashtables!", -1);
864 }
865 }
866 m_SubgraphList.clear();
867 m_RuleList.clear();
868 m_LabelList.clear();
869 m_TagList.clear();
870 return true;
871 }
872
findSubGraph(std::string & s,SubGraph * & p_SubGraph)873 bool GRXMLDoc::findSubGraph(std::string & s, SubGraph *&p_SubGraph)
874 {
875 return m_SubgraphList.getValue(s, &p_SubGraph);
876 }
877
findRule(int i,std::string & s)878 bool GRXMLDoc::findRule(int i, std::string &s )
879 {
880 return m_RuleList.getIndex( i, &s );
881 }
882
findTag(int i,std::string & s)883 bool GRXMLDoc::findTag(int i, std::string &s )
884 {
885 return m_TagList.getValue( i, &s );
886 }
887
findLabel(int i,std::string & s)888 bool GRXMLDoc::findLabel(int i, std::string &s )
889 {
890 return m_LabelList.getValue( i, &s );
891 }
892
findSubGraphIndex(SubGraph * p_SubGraph,std::string & s)893 bool GRXMLDoc::findSubGraphIndex( SubGraph *p_SubGraph, std::string &s )
894 {
895 return m_SubgraphList.getIndex( p_SubGraph, &s );
896 }
897
findRuleIndex(std::string s,int & i)898 bool GRXMLDoc::findRuleIndex( std::string s, int &i )
899 {
900 return m_RuleList.getValue( s, &i );
901 }
findTagIndex(std::string s,int & i)902 bool GRXMLDoc::findTagIndex( std::string s, int &i )
903 {
904 return m_TagList.getIndex( s, &i );
905 }
findLabelIndex(std::string s,int & i)906 bool GRXMLDoc::findLabelIndex( std::string s, int &i )
907 {
908 return m_LabelList.getIndex( s, &i );
909 }
findMeta(const std::string & sn,std::string & s)910 bool GRXMLDoc::findMeta(const std::string & sn, std::string &s)
911 {
912 return m_MetaKeyValPairs.getValue( sn, &s );
913 }
setMeta(const std::string & sn,const std::string & s)914 bool GRXMLDoc::setMeta(const std::string & sn, const std::string &s)
915 {
916 std::string tmp;
917 if(findMeta(sn,tmp))
918 m_MetaKeyValPairs.remove(sn);
919 return m_MetaKeyValPairs.insert(sn,s);
920 }
921
addTagToList(std::string const & s)922 bool GRXMLDoc::addTagToList( std::string const& s )
923 {
924 bool success = true;
925 // Make values unique
926 int index;
927 if ( !findTagIndex( s, index ) )
928 success = m_TagList.insert( m_TagAutoIndex++, s );
929 return success;
930 }
931
932
addLabelToList(std::string const & s)933 bool GRXMLDoc::addLabelToList( std::string const& s )
934 {
935 // TODO: Labels should be unique. Change key.
936 int index;
937 bool bRes = m_LabelList.getIndex( s, &index );
938 if(bRes == true) {
939 return false; // exists
940 }
941 bRes = m_LabelList.insert( m_LabelAutoIndex++, s );
942 return bRes;
943 }
944
printLists()945 void GRXMLDoc::printLists()
946 {
947 m_SubgraphList.print();
948 m_RuleList.print();
949 m_TagList.print();
950 m_LabelList.print();
951 }
952
953
printSubgraphs()954 void GRXMLDoc::printSubgraphs()
955 {
956 SubGraph *p_SubGraph;
957 std::string rule;
958 int index;
959 if ( m_RuleList.getFirst( &rule, &index) ) {
960 if ( findSubGraph( rule, p_SubGraph ) ) {
961 DEBUG_PRINT("============ Rule: " + rule + "============");
962 printSubgraph( *p_SubGraph );
963 while ( m_RuleList.getNext( &rule, &index) ) {
964 if ( findSubGraph( rule, p_SubGraph ) ) {
965 printSubgraph( *p_SubGraph );
966 }
967 }
968 }
969 }
970 }
971
972
printSubgraph(SubGraph & p_SubGraph)973 void GRXMLDoc::printSubgraph( SubGraph &p_SubGraph )
974 {
975 p_SubGraph.PrintWithLabels( *this );
976 }
977
978
getRuleRefName(XMLNode & node,std::string & ruleName)979 bool GRXMLDoc::getRuleRefName(XMLNode &node, std::string &ruleName)
980 {
981 const char* attr;
982 std::string s = GETATTR("uri" );
983 if (s.empty()) {
984 FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
985 }
986 // Remove the #:
987 int p1 = s.find("#");
988 if ( p1 !=0 ) {
989 FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'", -1 );
990 }
991 ruleName.assign( s, 1, s.size() );
992 return true;
993 }
994
initializeLists()995 void GRXMLDoc::initializeLists()
996 {
997 m_SubgraphList.setName("Subgraphs");
998 m_RuleList.setName("Rules");
999 m_TagList.setName("Tags");
1000 m_LabelList.setName("Labels");
1001
1002 /* Predefined rules. NB Labels are also created for each rule added.
1003 // The required order for these labels in the .map output file is:
1004 // 0 eps
1005 // next come slots
1006 // pau and pau2
1007 // everything else
1008 // We will add all these now in case they are referenced and we will
1009 // reindex after we have parsed the grammar -- when we have the list
1010 // of slots. This re-indexing is for the output files .map and .P.txt.
1011 //
1012 */
1013 addLabelToList( "eps" );
1014
1015 addLabelToList( "-pau-" );
1016 addLabelToList( "-pau2-" );
1017 }
1018
writeMapFile(std::string & fileName)1019 void GRXMLDoc::writeMapFile( std::string & fileName )
1020 {
1021 // We need to re-index in order to put the labels in correct order:
1022 // 1. eps
1023 // 2. all slots
1024 // 3. all rules
1025 // 4. -pau- words
1026 // 5. remaining labels
1027 ofstream outfile;
1028 int index, origIndex;
1029 std::string label;
1030 std::string slotRuleName;
1031 std::string scope; // For rules
1032 HashMap<int,std::string> orderedList;
1033 int orderedIndex=0;
1034 // 1. eps
1035 orderedList.insert( orderedIndex++, "eps" );
1036
1037 // 2. slots
1038 if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1039 if ( IsSlot( label ) ) {
1040 orderedList.insert( orderedIndex++, label );
1041 }
1042 while (m_LabelList.getNext( &origIndex, &label ) ) {
1043 if ( IsSlot( label ) ) {
1044 orderedList.insert( orderedIndex++, label );
1045 }
1046 }
1047 }
1048
1049 // 3. Now rules, or anything with @
1050 if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1051 do {
1052 #if GRXML_DEBUG
1053 std::cout << label << " "<< label.find_first_of ("@") << std::endl;
1054 #endif
1055 if (!IsSlot(label) && label.find_first_of ("@") != string::npos) {
1056 #if GRXML_DEBUG
1057 std::cout << " Adding " << label << std::endl;
1058 #endif
1059 orderedList.insert( orderedIndex++, label );
1060 }
1061 } while (m_LabelList.getNext( &origIndex, &label ) );
1062 }
1063
1064 // 4. pau
1065 orderedList.insert( orderedIndex++, "-pau-" );
1066 orderedList.insert( orderedIndex++, "-pau2-" );
1067
1068 // 5. Remaining stuff. NB We depend upon the label not
1069 // being added twice.
1070 if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1071 if ( !orderedList.getIndex( label, &index ) ) {
1072 orderedList.insert( orderedIndex++, label );
1073 }
1074 while (m_LabelList.getNext( &origIndex, &label ) ) {
1075 if ( !orderedList.getIndex( label, &index ) ) {
1076 orderedList.insert( orderedIndex++, label );
1077 }
1078 }
1079 }
1080 outfile.open ( fileName.c_str() );
1081
1082 bool bRes = orderedList.getFirst( &index, &label );
1083 do {
1084 if(!bRes) break;
1085 // Look up scope using original index
1086 m_LabelList.getIndex( label, &origIndex );
1087 if (m_RuleScope.getValue(origIndex, &scope) )
1088 label = scope + ":" + label;
1089 outfile << label << " " << index << std::endl;
1090 bRes = orderedList.getNext( &index, &label );
1091 } while(bRes);
1092
1093 outfile.close();
1094 }
1095
1096
writeScriptFile(std::string & fileName)1097 void GRXMLDoc::writeScriptFile( std::string & fileName )
1098 {
1099 ofstream outfile;
1100 int index;
1101 std::string label;
1102 outfile.open ( fileName.c_str() );
1103 if ( m_TagList.getFirst( &index, &label ) ) {
1104 outfile << index << " " << label << std::endl;
1105 }
1106 while (m_TagList.getNext( &index, &label ) ) {
1107 outfile << index << " " << label << std::endl;
1108 }
1109 outfile.close();
1110
1111 //m_LabelList.writeFile( fileName );
1112 }
1113
writeParamsFile(std::string & fileName)1114 void GRXMLDoc::writeParamsFile( std::string & fileName )
1115 {
1116 std::string wtw;
1117 ofstream outfile;
1118 bool bRes;
1119
1120 outfile.open(fileName.c_str());
1121
1122 std::string metaname = "word_penalty";
1123 bRes = findMeta(metaname, wtw);
1124 if(bRes)
1125 outfile << metaname.c_str() << "\t=\t" << wtw.c_str() << std::endl;
1126
1127 // outfile << "locale" << "\t=\t" << m_XMLLanguage << std::endl;
1128 outfile.close();
1129 }
1130
writeGraphFiles(std::string & prefix,bool bDoWriteRecogGraphs)1131 void GRXMLDoc::writeGraphFiles( std::string& prefix, bool bDoWriteRecogGraphs)
1132 {
1133 SubGraph *p_SubGraph;
1134 SubGraph *p_SemGraph;
1135 std::string fileName;
1136 if ( !findSubGraph( m_RootRule, p_SubGraph ) ) {
1137 FATAL_ERROR ("ERROR: writeGraphFiles - no root rule "+ m_RootRule + " defined. No file created", -1 );
1138 }
1139
1140 // Create .P.txt
1141 printf ("\nCreating semantic graph file\n");
1142 p_SemGraph = new SubGraph( (char *) "Main", -1);
1143 m_pGraph->BeginRule( p_SemGraph );
1144 m_pGraph->AddRuleRef( p_SemGraph, p_SubGraph->getRuleId());
1145 m_pGraph->EndRule( p_SemGraph );
1146 m_pGraph->ExpandRules (p_SemGraph);
1147 p_SemGraph->RemoveInternalConnections ();
1148
1149 p_SemGraph->AddTerminalConnections ();
1150 p_SemGraph->ReduceArcsByEquivalence();
1151 p_SemGraph->RemoveUnreachedConnections (-1, -1);
1152 p_SemGraph->DeterminizeArcs();
1153 p_SemGraph->RemoveUnreachedConnections (-1, -1);
1154 p_SemGraph->ReduceArcsByEquivalence();
1155 p_SemGraph->RemoveUnreachedConnections (-1, -1);
1156 fileName = prefix + ".P.txt";
1157 p_SemGraph->WriteForwardGraphWithSemantic( fileName, *this );
1158 delete p_SemGraph;
1159
1160 fileName = prefix + ".omap";
1161 this->WriteOLabels(fileName);
1162 }
1163
sortLabels()1164 void GRXMLDoc::sortLabels()
1165 {
1166 // We need to re-index in order to put the labels in correct order:
1167 int index=0, origIndex;
1168 std::string label;
1169 std::string slotRuleName;
1170 std::string scope; // For rules
1171 std::vector <std::string> orderedList;
1172 if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1173 // Look up scope using original index
1174 orderedList.push_back( label );
1175 while (m_LabelList.getNext( &origIndex, &label ) ) {
1176 orderedList.push_back( label );
1177 }
1178 }
1179 std::sort(orderedList.begin(), orderedList.end() );
1180 m_SortedLabelList.clear();
1181 index=0;
1182 for (std::vector<std::string>::const_iterator citer = orderedList.begin();
1183 citer != orderedList.end(); ++citer) {
1184 label = *citer;
1185 m_LabelList.getIndex( label, &origIndex );
1186 m_SortedLabelList.insert( index, label );
1187 index++;
1188 // std::cout <<"Sorted: " << index <<" " << label <<std::endl;
1189 }
1190 return;
1191 }
1192
findSortedLabel(int i,std::string & s)1193 bool GRXMLDoc::findSortedLabel(int i, std::string &s )
1194 {
1195 if (m_SortedLabelList.isEmpty() ) {
1196 sortLabels(); // Create the sorted label list.
1197 }
1198 return m_SortedLabelList.getValue( i, &s );
1199 }
1200
findSortedLabelIndex(int i,int & sortedIndex)1201 bool GRXMLDoc::findSortedLabelIndex( int i, int &sortedIndex )
1202 {
1203 std::string s;
1204 if (m_SortedLabelList.isEmpty() ) {
1205 sortLabels(); // Create the sorted label list.
1206 }
1207 if ( m_LabelList.getValue( i, &s ) ) {
1208 if ( m_SortedLabelList.getIndex(s, &sortedIndex )) {
1209 return true;
1210 }
1211 }
1212 return false;
1213 }
1214
addOLabelToOList(std::string & s)1215 void GRXMLDoc::addOLabelToOList( std::string &s)
1216 {
1217 m_OutputPtxtLabels.insert( s, 0);
1218 }
1219
WriteOLabels(const std::string & fileName)1220 bool GRXMLDoc::WriteOLabels(const std::string& fileName)
1221 {
1222 HashMap<int,std::string> invMap;
1223 int count = 0;
1224 int max_script_label = 0;
1225 int scriptID = 0;
1226 std::map<std::string, int>::iterator iter;
1227 bool bFound;
1228 int tmp;
1229
1230 std::string strIndex = "eps";
1231 bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
1232 if(bFound)
1233 m_OutputPtxtLabels.remove(strIndex);
1234 m_OutputPtxtLabels.insert(strIndex, count);
1235 invMap.insert( count, strIndex);
1236 count++;
1237
1238 strIndex = "{";
1239 bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
1240 if(bFound)
1241 m_OutputPtxtLabels.remove(strIndex);
1242 m_OutputPtxtLabels.insert(strIndex, count);
1243 invMap.insert( count, strIndex);
1244 count++;
1245
1246 iter = m_OutputPtxtLabels.begin();
1247 for( ; iter!=m_OutputPtxtLabels.end(); iter++) {
1248 const char* label = iter->first.c_str();
1249 if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)
1250 && strspn(label+SCRIPT_LABEL_PREFIX_LEN,"0123456789")==strlen(label+SCRIPT_LABEL_PREFIX_LEN) ) {
1251 scriptID = atoi(label+SCRIPT_LABEL_PREFIX_LEN);
1252 if(max_script_label < scriptID)
1253 max_script_label = scriptID;
1254 }/* else if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)) {
1255 invMap.insert(count, iter->first);
1256 iter->second = count;
1257 count++;
1258 }*/
1259 else if(!invMap.getIndex((iter->first), &tmp)){
1260 invMap.insert(count, iter->first);
1261 iter->second = count;
1262 count++;
1263 }
1264 }
1265
1266 cout << "found max_script_label " << max_script_label << endl;
1267 for(int j=0; j<=max_script_label; j++) {
1268 std::stringstream ss;
1269 ss << SCRIPT_LABEL_PREFIX << j;
1270 if(!invMap.getIndex( ss.str(), &tmp)) {
1271 invMap.insert( count++, ss.str());
1272 }
1273 }
1274
1275 std::ofstream outfile(fileName.c_str());
1276 std::string outscript;
1277 if(!outfile) {
1278 FATAL_ERROR( "Error: opening the omap file for output", 1);
1279 WARNING( "Error: opening the omap file for output");
1280 return 1;
1281 }
1282 for(int i=0; i<count; i++) {
1283 outscript = "";
1284 invMap.getValue(i,&outscript);
1285 if(outscript.length() == 0) {
1286 cout << "error: internal error while making .omap " << i << endl;
1287 FATAL_ERROR("error",1);
1288 }
1289 outfile << outscript.c_str() << " " << i << std::endl;
1290 }
1291 outfile.close();
1292 return 0;
1293 }
1294