• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6'''A gatherer for the TotalRecall brand of HTML templates with replaceable
7portions.  We wanted to reuse extern.tclib.api.handlers.html.TCHTMLParser
8but this proved impossible due to the fact that the TotalRecall HTML templates
9are in general quite far from parseable HTML and the TCHTMLParser derives
10from HTMLParser.HTMLParser which requires relatively well-formed HTML.  Some
11examples of "HTML" from the TotalRecall HTML templates that wouldn't be
12parseable include things like:
13
14  <a [PARAMS]>blabla</a>  (not parseable because attributes are invalid)
15
16  <table><tr><td>[LOTSOFSTUFF]</tr></table> (not parseable because closing
17                                            </td> is in the HTML [LOTSOFSTUFF]
18                                            is replaced by)
19
The other problem with using general parsers (such as TCHTMLParser) is that
we want to make sure we output the TotalRecall template with as few changes
as possible in terms of whitespace characters, layout etc.  With any parser
that generates a parse tree, and generates output by dumping the parse tree,
we would always have little inconsistencies which could cause bugs (the
TotalRecall template stuff is quite brittle and can break if e.g. a tab
character is replaced with spaces).
27
28The solution, which may be applicable to some other HTML-like template
29languages floating around Google, is to create a parser with a simple state
30machine that keeps track of what kind of tag it's inside, and whether it's in
31a translateable section or not.  Translateable sections are:
32
33a) text (including [BINGO] replaceables) inside of tags that
34   can contain translateable text (which is all tags except
35   for a few)
36
37b) text inside of an 'alt' attribute in an <image> element, or
38   the 'value' attribute of a <submit>, <button> or <text>
39   element.
40
The parser does not build up a parse tree but rather a "skeleton" which
is a list of nontranslateable strings intermingled with grit.clique.MessageClique
objects.  This simplifies the parser considerably compared to a regular HTML
parser.  To output a translated document, each item in the skeleton is
printed out, with the relevant Translation from each MessageClique being used
for the requested language.
47
48This implementation borrows some code, constants and ideas from
49extern.tclib.api.handlers.html.TCHTMLParser.
50'''
51
52
53import re
54import types
55
56from grit import clique
57from grit import exception
58from grit import lazy_re
59from grit import util
60from grit import tclib
61
62from grit.gather import interface
63
64
# HTML tags which break (separate) chunks.  HtmlChunks.Parse() ends the
# current translateable chunk when it meets one of these (unless a
# message-no-break comment precedes it).  Note that 'script', 'style' and
# 'pre' are listed twice; this is harmless because the list is only used for
# membership tests.
_BLOCK_TAGS = ['script', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'br',
              'body', 'style', 'head', 'title', 'table', 'tr', 'td', 'th',
              'ul', 'ol', 'dl', 'nl', 'li', 'div', 'object', 'center',
              'html', 'link', 'form', 'select', 'textarea',
              'button', 'option', 'map', 'area', 'blockquote', 'pre',
              'meta', 'xmp', 'noscript', 'label', 'tbody', 'thead',
              'script', 'style', 'pre', 'iframe', 'img', 'input', 'nowrap',
              'fieldset', 'legend']

# HTML tags which may appear within a chunk.  HtmlToMessage() raises
# BlockTagInTranslateableChunk for any tag that is not in this list unless
# block tags are explicitly allowed.
_INLINE_TAGS = ['b', 'i', 'u', 'tt', 'code', 'font', 'a', 'span', 'small',
               'key', 'nobr', 'url', 'em', 's', 'sup', 'strike',
               'strong']

# HTML tags within which linebreaks are significant.  Translateable chunks
# whose last block element is one of these do not get their whitespace folded.
_PREFORMATTED_TAGS = ['textarea', 'xmp', 'pre']

# A dict mapping some HTML tag names (inline ones as well as a few block
# ones such as 'br', 'li' and 'img') to more meaningful names for those tags.
# This is used when generating placeholders representing these tags.
_HTML_PLACEHOLDER_NAMES = { 'a' : 'link', 'br' : 'break', 'b' : 'bold',
  'i' : 'italic', 'li' : 'item', 'ol' : 'ordered_list', 'p' : 'paragraph',
  'ul' : 'unordered_list', 'img' : 'image', 'em' : 'emphasis' }

# We append one of these characters (after an underscore) to distinguish
# between different placeholders with basically the same name (e.g.
# BEGIN_BOLD_1, BEGIN_BOLD_2).  Keep in mind that a placeholder name must not
# be a substring of any other placeholder name in the same message, so we
# can't simply count (BOLD_1 would be a substring of BOLD_10).  The length of
# this string is therefore also the maximum number of same-named placeholders
# that can be numbered within one message.
_SUFFIXES = '123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
96
# Matches a run of "whitespace" in an HTML document.  Besides real whitespace
# characters this covers &nbsp; entities, the literal two-character sequences
# backslash-n and backslash-r, and description comments of the form
# <!-- desc=... -->, all of which are treated as whitespace by the chunker.
_WHITESPACE = lazy_re.compile(r'(\s|&nbsp;|\\n|\\r|<!--\s*desc\s*=.*?-->)+',
                              re.DOTALL)

# Matches whitespace sequences which can be folded into a single whitespace
# character.  A run may be a single character, so that a lone non-space
# whitespace character (e.g. a tab or newline) is also replaced with a space.
_FOLD_WHITESPACE = lazy_re.compile(r'\s+')

# Finds a non-whitespace character
_NON_WHITESPACE = lazy_re.compile(r'\S')

# Matches two or more &nbsp; in a row (a single &nbsp; is not changed into
# placeholders because different languages require different numbers of spaces
# and placeholders must match exactly; more than one is probably a "special"
# whitespace sequence and should be turned into a placeholder).
_NBSP = lazy_re.compile(r'&nbsp;(&nbsp;)+')

# Matches nontranslateable chunks of the document: whole <script> and <style>
# elements including their content, HTML comments, <?IMPORT ...> tags,
# namespace-prefixed ("custom") opening and closing tags, and <!...>
# declarations.  Matching is case-insensitive and '.' also matches newlines.
_NONTRANSLATEABLES = lazy_re.compile(r'''
  <\s*script.+?<\s*/\s*script\s*>
  |
  <\s*style.+?<\s*/\s*style\s*>
  |
  <!--.+?-->
  |
  <\?IMPORT\s.+?>           # import tag
  |
  <\s*[a-zA-Z_]+:.+?>       # custom tag (open)
  |
  <\s*/\s*[a-zA-Z_]+:.+?>   # custom tag (close)
  |
  <!\s*[A-Z]+\s*([^>]+|"[^"]+"|'[^']+')*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)
132
# Matches a tag and its attributes.  Named groups: 'closing' (the '/' of a
# closing tag, if any), 'element' (the tag name), 'atts' (the attributes
# and/or [REPLACEABLES] inside the tag) and 'empty' (the '/' of an empty
# element such as <br/>, if any).
_ELEMENT = lazy_re.compile(r'''
  # Optional closing /, element name
  <\s*(?P<closing>/)?\s*(?P<element>[a-zA-Z0-9]+)\s*
  # Attributes and/or replaceables inside the tag, if any
  (?P<atts>(
    \s*([a-zA-Z_][-:.a-zA-Z_0-9]*) # Attribute name
    (\s*=\s*(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?
    |
    \s*\[(\$?\~)?([A-Z0-9-_]+?)(\~\$?)?\]
  )*)
  \s*(?P<empty>/)?\s*> # Optional empty-tag closing /, and tag close
  ''',
  re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches elements that may have translateable attributes: the 'title' of a
# <table>, the 'alt' of an <img>, and the 'value' of an <input> whose type is
# button, reset, text or submit.  The value of the special attribute is given
# by exactly one of the groups 'value1', 'value2', 'value3' or 'value4'
# (single- vs. double-quoted, and 'value' appearing after vs. before 'type');
# the other groups do not participate in the match.  Note that this
# regexp demands that the attribute value be quoted; this is necessary because
# the non-tree-building nature of the parser means we don't know when we're
# writing out attributes, so we wouldn't know to escape spaces.
_SPECIAL_ELEMENT = lazy_re.compile(r'''
  <\s*(
    input[^>]+?value\s*=\s*(\'(?P<value3>[^\']*)\'|"(?P<value4>[^"]*)")
    [^>]+type\s*=\s*"?'?(button|reset|text|submit)'?"?
    |
    (
      table[^>]+?title\s*=
      |
      img[^>]+?alt\s*=
      |
      input[^>]+?type\s*=\s*"?'?(button|reset|text|submit)'?"?[^>]+?value\s*=
    )
    \s*(\'(?P<value1>[^\']*)\'|"(?P<value2>[^"]*)")
  )[^>]*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)

# Matches stuff that is translateable if it occurs in the right context
# (between tags).  This includes all characters and character entities.
# Note that this also matches &nbsp; which needs to be handled as whitespace
# before this regexp is applied.
# NOTE(review): this pattern is not referenced anywhere else in the visible
# part of this file -- confirm whether it is still used.
_CHARACTERS = lazy_re.compile(r'''
  (
    \w
    |
    [\!\@\#\$\%\^\*\(\)\-\=\_\+\[\]\{\}\\\|\;\:\'\"\,\.\/\?\`\~]
    |
    &(\#[0-9]+|\#x[0-9a-fA-F]+|[A-Za-z0-9]+);
  )+
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches Total Recall's "replaceable" tags, which are just any text
# in capitals (digits, '-' and '_' are also allowed) enclosed by delimiters
# like [] or [~~] or [$~~$] (e.g. [HELLO], [~HELLO~] and [$~HELLO~$]).  The
# text between the delimiters is available as group 'name'.
_REPLACEABLE = lazy_re.compile(r'\[(\$?\~)?(?P<name>[A-Z0-9-_]+?)(\~\$?)?\]',
                               re.MULTILINE)
188
189
# Matches the silly [!]-prefixed "header" that is used in some TotalRecall
# templates: a '[!]' line, then a 'title<TAB>...' line whose text is captured
# as group 'title', then everything up to and including the first blank line.
_SILLY_HEADER = lazy_re.compile(r'\[!\]\ntitle\t(?P<title>[^\n]+?)\n.+?\n\n',
                                re.MULTILINE | re.DOTALL)


# Matches a comment that provides a description for the message it occurs in,
# i.e. <!-- desc=Some text -->.  The text is captured as group 'description'.
_DESCRIPTION_COMMENT = lazy_re.compile(
  r'<!--\s*desc\s*=\s*(?P<description>.+?)\s*-->', re.DOTALL)

# Matches a comment which is used to break apart multiple messages, i.e.
# <!-- message-break -->.
_MESSAGE_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-break\s*-->',
                                         re.DOTALL)

# Matches a comment which is used to prevent block tags from splitting a
# message, i.e. <!-- message-no-break -->.  Unlike the other patterns in this
# file this one is compiled eagerly with re.compile rather than
# lazy_re.compile.
_MESSAGE_NO_BREAK_COMMENT = re.compile(r'<!--\s*message-no-break\s*-->',
                                       re.DOTALL)
207
208
209_DEBUG = 0
210def _DebugPrint(text):
211  if _DEBUG:
212    print text.encode('utf-8')
213
214
class HtmlChunks(object):
  '''A parser that knows how to break an HTML-like document into a list of
  chunks, where each chunk is either translateable or non-translateable.

  Each chunk is a tuple (is_translateable, text, description).  The chunk
  texts are sections of the original document; apart from the description
  and message-break comments that AddChunk() removes and the whitespace it
  normalizes inside translateable chunks, concatenating the text of all
  chunks results in the original document.

  All parser state is (re)initialized by Parse(); the other methods are
  helpers for Parse() and rely on that state being present.
  '''

  def InTranslateable(self):
    '''Returns True if the parser is currently inside a translateable chunk.'''
    return self.last_translateable != -1

  def Rest(self):
    '''Returns the part of the document from the current position onwards.'''
    return self.text_[self.current:]

  def StartTranslateable(self):
    '''Switches the parser into a translateable chunk starting right after the
    last for-sure nontranslateable character, first emitting the pending
    nontranslateable chunk (if there is any text in it).
    '''
    assert not self.InTranslateable()
    if self.current != 0:
      # Append a nontranslateable chunk
      chunk_text = self.text_[self.chunk_start : self.last_nontranslateable + 1]
      # Needed in the case where document starts with a translateable.
      if chunk_text:
        self.AddChunk(False, chunk_text)
    self.chunk_start = self.last_nontranslateable + 1
    self.last_translateable = self.current
    self.last_nontranslateable = -1

  def EndTranslateable(self):
    '''Emits the current translateable chunk, which extends up to and including
    the last for-sure translateable character, and switches the parser back
    into nontranslateable mode.
    '''
    assert self.InTranslateable()
    # Append a translateable chunk
    self.AddChunk(True,
                  self.text_[self.chunk_start : self.last_translateable + 1])
    self.chunk_start = self.last_translateable + 1
    self.last_translateable = -1
    self.last_nontranslateable = self.current

  def AdvancePast(self, match):
    '''Moves the current position past 'match', which must be a match object
    obtained by matching against self.Rest().
    '''
    self.current += match.end()

  def AddChunk(self, translateable, text):
    '''Adds a chunk to self, removing linebreaks and duplicate whitespace
    if appropriate.

    A description comment found in 'text' is remembered (and stripped from the
    text) so that it can be attached to the next translateable chunk;
    message-break comments are stripped as well.  Chunks whose text ends up
    empty are not added.

    Args:
      translateable: Whether the chunk is translateable.
      text: The text of the chunk.
    '''
    m = _DESCRIPTION_COMMENT.search(text)
    if m:
      self.last_description = m.group('description')
      # Remove the description from the output text
      text = _DESCRIPTION_COMMENT.sub('', text)

    m = _MESSAGE_BREAK_COMMENT.search(text)
    if m:
      # Remove the comment from the output text.  It should already effectively
      # break apart messages.
      text = _MESSAGE_BREAK_COMMENT.sub('', text)

    if translateable and self.last_element_ not in _PREFORMATTED_TAGS:
      if self.fold_whitespace_:
        # Fold whitespace sequences if appropriate.  This is optional because it
        # alters the output strings.
        text = _FOLD_WHITESPACE.sub(' ', text)
      else:
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # This whitespace folding doesn't work in all cases, thus the
        # fold_whitespace flag to support backwards compatibility.
        text = text.replace('   ', ' ')
        text = text.replace('  ', ' ')

    if translateable:
      # A description applies only to the first translateable chunk that
      # follows it, so it is consumed here.
      description = self.last_description
      self.last_description = ''
    else:
      description = ''

    if text != '':
      self.chunks_.append((translateable, text, description))

  def Parse(self, text, fold_whitespace):
    '''Parses 'text' into an intermediate format stored in self.chunks_
    which is translateable and nontranslateable chunks.  Also returns
    self.chunks_

    Args:
      text: The HTML for parsing.
      fold_whitespace: Whether whitespace sequences should be folded into a
        single space.

    Return:
      [chunk1, chunk2, chunk3, ...] where each chunk is a tuple
      (is_translateable, text, description)
    '''
    #
    # Chunker state
    #

    self.text_ = text
    self.fold_whitespace_ = fold_whitespace

    # A list of tuples (is_translateable, text, description) which represents
    # the document after chunking.
    self.chunks_ = []

    # Start index of the last chunk, whether translateable or not
    self.chunk_start = 0

    # Index of the last for-sure translateable character if we are parsing
    # a translateable chunk, -1 to indicate we are not in a translateable chunk.
    # This is needed so that we don't include trailing whitespace in the
    # translateable chunk (whitespace is neutral).
    self.last_translateable = -1

    # Index of the last for-sure nontranslateable character if we are parsing
    # a nontranslateable chunk, -1 if we are not in a nontranslateable chunk.
    # This is needed to make sure we can group e.g. "<b>Hello</b> there"
    # together instead of just "Hello</b> there" which would be much worse
    # for translation.
    self.last_nontranslateable = -1

    # Index of the character we're currently looking at.
    self.current = 0

    # The name of the last block element parsed.
    self.last_element_ = ''

    # The last explicit description we found.
    self.last_description = ''

    # Whether no-break was the last chunk seen
    self.last_nobreak = False

    while self.current < len(self.text_):
      _DebugPrint('REST: %s' % self.text_[self.current:self.current+60])

      m = _MESSAGE_NO_BREAK_COMMENT.match(self.Rest())
      if m:
        self.AdvancePast(m)
        self.last_nobreak = True
        continue

      # Try to match whitespace
      m = _WHITESPACE.match(self.Rest())
      if m:
        # Whitespace is neutral, it just advances 'current' and does not switch
        # between translateable/nontranslateable.  If we are in a
        # nontranslateable section that extends to the current point, we extend
        # it to include the whitespace.  If we are in a translateable section,
        # we do not extend it until we find
        # more translateable parts, because we never want a translateable chunk
        # to end with whitespace.
        if (not self.InTranslateable() and
            self.last_nontranslateable == self.current - 1):
          self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Then we try to match nontranslateables
      m = _NONTRANSLATEABLES.match(self.Rest())
      if m:
        if self.InTranslateable():
          self.EndTranslateable()
        self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Now match all other HTML element tags (opening, closing, or empty, we
      # don't care).
      m = _ELEMENT.match(self.Rest())
      if m:
        element_name = m.group('element').lower()
        if element_name in _BLOCK_TAGS:
          self.last_element_ = element_name
          if self.InTranslateable():
            if self.last_nobreak:
              # A message-no-break comment suppresses exactly one break.
              self.last_nobreak = False
            else:
              self.EndTranslateable()

          # Check for "special" elements, i.e. ones that have a translateable
          # attribute, and handle them correctly.  Note that all of the
          # "special" elements are block tags, so no need to check for this
          # if the tag is not a block tag.
          sm = _SPECIAL_ELEMENT.match(self.Rest())
          if sm:
            # Get the name of the value group that took part in the match.
            # Compare against None rather than testing truthiness so that an
            # empty attribute value (e.g. alt="") still selects its own
            # group; otherwise the loop would fall through and leave 'group'
            # naming an arbitrary group that may not have participated, whose
            # start() and end() are -1, corrupting the chunk boundaries.
            for group, value in sm.groupdict().items():
              if value is not None:
                break

            # First make a nontranslateable chunk up to and including the
            # quote before the translateable attribute value
            self.AddChunk(False, self.text_[
              self.chunk_start : self.current + sm.start(group)])
            # Then a translateable for the translateable bit
            self.AddChunk(True, self.Rest()[sm.start(group) : sm.end(group)])
            # Finally correct the data invariant for the parser
            self.chunk_start = self.current + sm.end(group)

          self.last_nontranslateable = self.current + m.end() - 1
        elif self.InTranslateable():
          # We're in a translateable and the tag is an inline tag, so we
          # need to include it in the translateable.
          self.last_translateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Anything else we find must be translateable, so we advance one character
      # at a time until one of the above matches.
      if not self.InTranslateable():
        self.StartTranslateable()
      else:
        self.last_translateable = self.current
      self.current += 1

    # Close the final chunk, which takes whatever is left of the document.
    self.AddChunk(self.InTranslateable(), self.text_[self.chunk_start : ])

    return self.chunks_
431
432
def HtmlToMessage(html, include_block_tags=False, description=''):
  '''Takes a bit of HTML, which must contain only "inline" HTML elements,
  and changes it into a tclib.Message.  This involves unescaping any entities
  in the text and replacing any HTML code with placeholders.

  If include_block_tags is true, no error will be given if block tags (e.g.
  <p> or <br>) are included in the HTML.  A block tag that directly follows a
  <!-- message-no-break --> comment is also accepted.

  Args:
    html: 'Hello <b>[USERNAME]</b>, how&nbsp;<i>are</i> you?'
    include_block_tags: False
    description: Description to attach to the resulting message.

  Return:
    A tclib.Message.  For the example above it is built from the text
      'Hello BEGIN_BOLDX_USERNAME_XEND_BOLD, '
      'how&nbsp;BEGIN_ITALICareEND_ITALIC you?'
    and the placeholders
      [ Placeholder('BEGIN_BOLD', '<b>', '(HTML code)'),
        Placeholder('X_USERNAME_X', '[USERNAME]', '(HTML code)'),
        Placeholder('END_BOLD', '</b>', '(HTML code)'),
        Placeholder('BEGIN_ITALIC', '<i>', '(HTML code)'),
        Placeholder('END_ITALIC', '</i>', '(HTML code)'), ]
    after which the text parts of the message content are passed through
    util.UnescapeHtml(..., replace_nbsp=False).  (A single &nbsp; is left in
    the text; only runs of two or more become a SPACE placeholder.)

  Raises:
    grit.exception.BlockTagInTranslateableChunk if a block tag is found while
    block tags are not allowed.
  '''
  # Approach is:
  # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
  # - then unescape all character entities in text in-between placeholders

  parts = []  # List of strings (for text chunks) and tuples
              # (name closure, original) for placeholders

  count_names = {}  # Map of base names to number of times used
  end_names = {}  # Map of base names to stack of end tags (for correct nesting)

  def MakeNameClosure(base, tag_type=''):
    '''Returns a closure that can be called once all names have been allocated
    to return the final name of the placeholder.  This allows us to minimally
    number placeholders for non-overlap.

    Also ensures that END_XXX_Y placeholders have the same Y as the
    corresponding BEGIN_XXX_Y placeholder when we have nested tags of the same
    type.

    Args:
      base: 'phname'
      tag_type: '' | 'begin' | 'end'

    Return:
      Closure()
    '''
    name = base.upper()
    if tag_type != '':
      name = ('%s_%s' % (tag_type, base)).upper()

    count_names[name] = count_names.get(name, 0) + 1

    # 'index' is bound now (as a default argument) so that each closure
    # remembers its own position among the placeholders sharing 'name';
    # count_names[name] itself is only final once all names are allocated.
    def MakeFinalName(name_=name, index=count_names[name] - 1):
      if tag_type.lower() == 'end' and end_names.get(base):
        return end_names[base].pop(-1)  # For correct nesting
      if count_names[name_] != 1:
        name_ = '%s_%s' % (name_, _SUFFIXES[index])
        # We need to use a stack to ensure that the end-tag suffixes match
        # the begin-tag suffixes.  Only needed when more than one tag of the
        # same type.
        if tag_type == 'begin':
          end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
          end_names.setdefault(base, []).append(end_name)

      return name_

    return MakeFinalName

  current = 0
  last_nobreak = False

  while current < len(html):
    m = _MESSAGE_NO_BREAK_COMMENT.match(html[current:])
    if m:
      last_nobreak = True
      current += m.end()
      continue

    m = _NBSP.match(html[current:])
    if m:
      parts.append((MakeNameClosure('SPACE'), m.group()))
      current += m.end()
      continue

    m = _REPLACEABLE.match(html[current:])
    if m:
      # Replaceables allow - but placeholders don't, so replace - with _
      ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
      parts.append((ph_name, m.group()))
      current += m.end()
      continue

    m = _SPECIAL_ELEMENT.match(html[current:])
    if m:
      if not include_block_tags:
        if last_nobreak:
          # A message-no-break comment excuses exactly one block tag.
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html)
      element_name = 'block'  # for simplification
      # Get the name of the value group that took part in the match.  Compare
      # against None rather than testing truthiness so that an empty attribute
      # value (e.g. alt="") still selects its own group; otherwise 'group'
      # could end up naming a group that did not participate, whose start()
      # and end() are -1, and the slices below would be wrong.
      for group, value in m.groupdict().items():
        if value is not None:
          break
      # Everything before the attribute value becomes the 'begin' placeholder,
      # the value itself is translateable text, and the remainder of the tag
      # becomes the 'end' placeholder.
      parts.append((MakeNameClosure(element_name, 'begin'),
                    html[current : current + m.start(group)]))
      parts.append(m.group(group))
      parts.append((MakeNameClosure(element_name, 'end'),
                    html[current + m.end(group) : current + m.end()]))
      current += m.end()
      continue

    m = _ELEMENT.match(html[current:])
    if m:
      element_name = m.group('element').lower()
      if not include_block_tags and element_name not in _INLINE_TAGS:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html[current:])
      if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
        element_name = _HTML_PLACEHOLDER_NAMES[element_name]

      # Make a name for the placeholder.  Empty elements (e.g. <br/>) get
      # neither a 'begin' nor an 'end' prefix.
      tag_type = ''
      if not m.group('empty'):
        if m.group('closing'):
          tag_type = 'end'
        else:
          tag_type = 'begin'
      parts.append((MakeNameClosure(element_name, tag_type), m.group()))
      current += m.end()
      continue

    # Plain text: extend the previous text part or start a new one.
    if parts and isinstance(parts[-1], types.StringTypes):
      parts[-1] += html[current]
    else:
      parts.append(html[current])
    current += 1

  # Resolve the final placeholder names in document order (the order matters
  # because end-tag names are popped off the stacks filled by begin tags).
  text_pieces = []
  placeholders = []
  for part in parts:
    if isinstance(part, tuple):
      final_name = part[0]()
      original = part[1]
      text_pieces.append(final_name)
      placeholders.append(tclib.Placeholder(final_name, original, '(HTML code)'))
    else:
      text_pieces.append(part)
  msg_text = ''.join(text_pieces)

  msg = tclib.Message(text=msg_text, placeholders=placeholders,
                      description=description)
  # Unescape the text parts of the message in place.
  content = msg.GetContent()
  for ix, item in enumerate(content):
    if isinstance(item, types.StringTypes):
      content[ix] = util.UnescapeHtml(item, replace_nbsp=False)

  return msg
600
601
class TrHtml(interface.GathererBase):
  '''Gatherer for a document or message written in the HTML template format
  used by Total Recall.'''

  def __init__(self, *args, **kwargs):
    super(TrHtml, self).__init__(*args, **kwargs)
    # Whether whitespace runs in translateable chunks are folded when parsing.
    self.fold_whitespace_ = False
    # List of strings and MessageClique objects making up the document.
    self.skeleton_ = []
    # Set on the first call to Parse() so that later calls are no-ops.
    self.have_parsed_ = False

  def SetAttributes(self, attrs):
    '''Picks up the node attributes this gatherer cares about.

    Only the fold_whitespace attribute is examined; folding is enabled when
    it is present and equal to 'true'.

    Args:
      attrs: The mapping of node attributes.
    '''
    fold = False
    if 'fold_whitespace' in attrs:
      fold = attrs['fold_whitespace'] == 'true'
    self.fold_whitespace_ = fold

  def GetText(self):
    '''Returns the text of the HTML document as stored by Parse().'''
    return self.text_

  def GetTextualIds(self):
    '''Returns a one-element list holding this gatherer's extkey.'''
    return [self.extkey]

  def GetCliques(self):
    '''Returns the skeleton items that are message cliques, i.e. one clique
    per translateable message in the document, in document order.'''
    cliques = []
    for entry in self.skeleton_:
      if isinstance(entry, clique.MessageClique):
        cliques.append(entry)
    return cliques

  def Translate(self, lang, pseudo_if_not_available=True,
                skeleton_gatherer=None, fallback_to_english=False):
    '''Returns this document with translateable messages filled with
    the translation for language 'lang'.

    Args:
      lang: 'en'
      pseudo_if_not_available: True
      skeleton_gatherer: Currently unused (see TODO below).
      fallback_to_english: Passed on to MessageForLanguage() of each clique.

    Return:
      The document as a single string: nontranslateable skeleton strings
      verbatim, placeholders as their original text, and translated text
      HTML-escaped.

    Raises:
      grit.exception.NotReady() if the skeleton is empty.
      NOTE(review): the original documentation also mentions
      grit.exception.NoSuchTranslation() when 'pseudo_if_not_available' is
      false and no translation exists; that would come from
      MessageForLanguage(), which is not visible here.
    '''
    if not self.skeleton_:
      raise exception.NotReady()

    # TODO(joi) Implement support for skeleton gatherers here.

    pieces = []
    for entry in self.skeleton_:
      if isinstance(entry, types.StringTypes):
        pieces.append(entry)
        continue

      translated = entry.MessageForLanguage(lang,
                                            pseudo_if_not_available,
                                            fallback_to_english)
      for part in translated.GetContent():
        if not isinstance(part, tclib.Placeholder):
          # We escape " characters to increase the chance that attributes
          # will be properly escaped.
          pieces.append(util.EscapeHtml(part, True))
        else:
          pieces.append(part.GetOriginal())

    return ''.join(pieces)

  def Parse(self):
    '''Loads the input file and builds the skeleton of strings and cliques.

    Does nothing if it has been called before.
    '''
    if self.have_parsed_:
      return
    self.have_parsed_ = True

    text = self._LoadInputFile()

    # Ignore the BOM character if the document starts with one.
    if text.startswith(u'\ufeff'):
      text = text[1:]

    self.text_ = text

    # Parsing is done in two phases:  First, we break the document into
    # translateable and nontranslateable chunks.  Second, we run through each
    # translateable chunk and insert placeholders for any HTML elements,
    # unescape escaped characters, etc.

    # First handle the silly little [!]-prefixed header because it's not
    # handled by our HTML parsers.  Only its title is translateable.
    header = _SILLY_HEADER.match(text)
    if header:
      title_start = header.start('title')
      title_end = header.end('title')
      header_end = header.end()
      self.skeleton_.append(text[:title_start])
      title_message = tclib.Message(text=text[title_start:title_end])
      self.skeleton_.append(self.uberclique.MakeClique(title_message))
      self.skeleton_.append(text[title_end : header_end])
      text = text[header_end:]

    chunks = HtmlChunks().Parse(text, self.fold_whitespace_)

    for is_translateable, chunk_text, chunk_description in chunks:
      if not is_translateable:
        self.skeleton_.append(chunk_text)
      else:
        message = HtmlToMessage(chunk_text, description=chunk_description)
        self.skeleton_.append(self.uberclique.MakeClique(message))

    # Go through the skeleton and change any messages that consist solely of
    # placeholders and whitespace into nontranslateable strings.
    for ix, entry in enumerate(self.skeleton_):
      if not isinstance(entry, clique.MessageClique):
        continue
      msg = entry.GetMessage()
      has_text = False
      for item in msg.GetContent():
        if not isinstance(item, types.StringTypes):
          continue
        if _NON_WHITESPACE.search(item) and item != '&nbsp;':
          has_text = True
          break
      if not has_text:
        self.skeleton_[ix] = msg.GetRealContent()

  def SubstituteMessages(self, substituter):
    '''Applies substitutions to all messages in the tree.

    Walks the skeleton and, for every MessageClique whose message is changed
    by the substituter, replaces the clique with a new one made from the
    substituted message.  All other skeleton items are kept as they are.

    Args:
      substituter: a grit.util.Substituter object.
    '''
    rebuilt = []
    for entry in self.skeleton_:
      replacement = entry
      if isinstance(entry, clique.MessageClique):
        original_message = entry.GetMessage()
        substituted = substituter.SubstituteMessage(original_message)
        if substituted is not original_message:
          replacement = self.uberclique.MakeClique(substituted)
      rebuilt.append(replacement)
    self.skeleton_ = rebuilt
745
746