Source code for corpustools.format.tmxparser

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# License: FreeBSD License or The BSD 2-Clause License

# Copyright (c) 2012, Leo Jiang
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

#     Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#     Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# Author: Leo Jiang <leo.jiang.dev@gmail.com>

# pylint: disable=I0011,C0301,C0103,R0902,E0202

"""TMX Parser Module"""

import codecs
import os.path
from xml.parsers.expat import ParserCreate
from xml.parsers.expat import ExpatError

from corpustools.lib.languagecode import LanguageCode

class TMXParser(object):
    """Extract the aligned segments of one language pair from a TMX file.

    The parser is event driven and uses ``xml.parsers.expat`` as its XML
    engine.  For every ``<tu>`` element that contains non-empty text for
    both the source and the target language, one line is written to the
    source output file and one line to the target output file.

    Inline markup found inside a ``<seg>`` element is kept in the output as
    literal tags.
    """

    def __init__(self):
        self.source_lang = None
        self.target_lang = None
        self.source_filepath = None
        self.target_filepath = None
        # Segment lines collected for the translation unit being processed.
        self.source = []
        self.target = []
        self._output_dir = None
        self.source_fp = None
        self.target_fp = None

        # The flag 'in_seg' indicates whether the current event happens
        # inside a <seg> element.  Some tags are embedded in the sentence
        # itself, so they have to be kept as part of the segment text.
        self.in_seg = False
        self.tuv_lang = None
        self.seg = None

        self.parser = self._create_parser()
        # An expat parser can only parse a single document; this flag lets
        # parse_file() know when a fresh parser has to be created.
        self._parser_used = False

    def _create_parser(self):
        """Return a new expat parser wired to this object's handlers."""
        parser = ParserCreate()
        parser.buffer_text = True
        parser.buffer_size = 4096
        try:
            parser.returns_unicode = True
        except AttributeError:
            # The attribute only exists on Python 2 parsers.
            pass
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.CharacterDataHandler = self.char_data_handler
        return parser

    def _reset_state(self):
        """Forget everything collected during a previous parse."""
        self.source = []
        self.target = []
        self.in_seg = False
        self.tuv_lang = None
        self.seg = None

    @staticmethod
    def _same_lang(left, right):
        """Compare two language codes, ignoring case; None never matches."""
        if left is None or right is None:
            return False
        return left.lower() == right.lower()

    @property
    def output_dir(self):
        """Directory the output files are written to, or None if unset."""
        return self._output_dir

    @output_dir.setter
    def output_dir(self, path):
        # None resets the directory; any other value is accepted only if it
        # names an existing directory and is silently ignored otherwise.
        if path is None:
            self._output_dir = None
        elif os.path.isdir(path):
            self._output_dir = path

    def parse_file(self, filename, source_lang, target_lang):
        """Parse a TMX file and write the two aligned plain text files.

        The output files are created in ``output_dir`` (the directory of
        ``filename`` when ``output_dir`` has not been set).  Their names are
        the stem of ``filename`` followed by ``'.'`` and the value returned
        by ``LanguageCode(lang).xx()``.  Both files are written as UTF-8.

        :param filename: path of the TMX file to read.
        :param source_lang: source language, passed to ``LanguageCode``.
        :param target_lang: target language, passed to ``LanguageCode``.
        :returns: 0 on success, the ``errno`` of the ``IOError`` if a file
            could not be opened, or the expat error code if the document
            is not well formed.
        """
        # Open in binary mode: expat decodes the document itself, using the
        # encoding declared in the XML prolog.
        try:
            fp = open(filename, 'rb')
        except IOError as e:
            print(e)
            return e.errno

        self.source_fp = None
        self.target_fp = None
        try:
            if self.output_dir is None:
                # abspath() keeps a bare file name from yielding an empty
                # directory, which the output_dir setter would reject.
                self.output_dir = os.path.dirname(os.path.abspath(filename))
            stem = os.path.splitext(os.path.basename(filename))[0]

            self.source_lang = LanguageCode(source_lang).TMX_form()
            self.target_lang = LanguageCode(target_lang).TMX_form()
            self.source_filepath = os.path.join(
                self.output_dir, stem + '.' + LanguageCode(source_lang).xx())
            self.target_filepath = os.path.join(
                self.output_dir, stem + '.' + LanguageCode(target_lang).xx())

            try:
                self.source_fp = codecs.open(self.source_filepath, 'w', 'utf-8')
                self.target_fp = codecs.open(self.target_filepath, 'w', 'utf-8')
            except IOError as e:
                print(e)
                return e.errno

            # A parser that has already consumed a document cannot be
            # reused, so start over with a new one and a clean state.
            if self._parser_used:
                self.parser = self._create_parser()
            self._parser_used = True
            self._reset_state()

            try:
                self.parser.ParseFile(fp)
            except ExpatError as e:
                print("[Error] {0}: {1}".format(os.path.basename(filename), e))
                return e.code
            return 0
        finally:
            # Whether parsing succeeded or failed, release every handle
            # that was actually opened.
            fp.close()
            for handle in (self.source_fp, self.target_fp):
                if handle is not None:
                    handle.close()

    def start_element_handler(self, name, attributes):
        """Expat callback invoked for every start tag.

        Inside a segment the tag is re-serialized and appended to the
        segment text.  ``tu`` starts a new translation unit, ``tuv`` records
        the language of the variant and ``seg`` starts a new segment.
        """
        if self.in_seg:
            pieces = [name]
            pieces.extend(key + '="' + attributes[key] + '"'
                          for key in attributes)
            # Joining the pieces avoids a stray space in attribute-less tags.
            self.seg += "<" + " ".join(pieces) + ">"
        if name == u"tu":
            self.source = []
            self.target = []
        if name == u"tuv":
            # Fall back to the plain 'lang' attribute; a variant without any
            # language attribute matches neither language.
            self.tuv_lang = attributes.get("xml:lang", attributes.get("lang"))
        if name == u"seg":
            self.seg = u""
            self.in_seg = True

    def end_element_handler(self, name):
        """Expat callback invoked for every end tag.

        Closing ``seg`` files the segment lines under the source and/or
        target language, closing ``tu`` writes the collected pair to the
        output files if both sides are non-empty.
        """
        if name == u"tu":
            source = u' '.join(self.source).strip()
            target = u' '.join(self.target).strip()
            if source and target:
                self.source_fp.write(source + os.linesep)
                self.target_fp.write(target + os.linesep)
        if name == u"tuv":
            self.tuv_lang = None
        if name == u"seg":
            self.in_seg = False
            lines = self.seg.splitlines()
            # Language codes are compared without regard to case.
            if self._same_lang(self.tuv_lang, self.source_lang):
                self.source.extend(lines)
            if self._same_lang(self.tuv_lang, self.target_lang):
                self.target.extend(lines)
            self.seg = u""
        if self.in_seg:
            # End tag of an inline element embedded in the segment.
            self.seg += "</" + name + ">"

    def char_data_handler(self, data):
        """Expat callback for character data; collects text inside a segment."""
        if self.in_seg:
            self.seg += data