Source code for streamparser

#!/usr/bin/env python3
# coding=utf-8
"""
Usage: streamparser.py [FILE]

Consumes input from a file (the first argument) or stdin, parsing and pretty-printing the readings of the lexical units found.
"""

__all__ = [
    'Knownness', 'known', 'unknown', 'biunknown', 'genunknown', 'LexicalUnit', 'SReading',
    'subreading_to_string', 'reading_to_string', 'mainpos', 'parse', 'parse_file',
]
__author__ = 'Sushain K. Cherivirala, Kevin Brubeck Unhammer'
__copyright__ = 'Copyright 2016--2018, Sushain K. Cherivirala, Kevin Brubeck Unhammer'
__credits__ = ['Sushain K. Cherivirala', 'Kevin Brubeck Unhammer']
__license__ = 'GPLv3+'
__status__ = 'Production'
__version__ = '5.0.2'

import fileinput
import functools
import itertools
import pprint
import re
import warnings
from collections import namedtuple

if False:
    from typing import Type, List, Tuple, Iterator, Iterable, Generator, Union  # noqa: F401


class Knownness:
    """Level of knowledge associated with a :class:`LexicalUnit`.

    Values: :class:`known`, :class:`unknown`, :class:`biunknown`, :class:`genunknown`
    """
    symbol = ''


class known(Knownness):  # noqa: N801
    pass


class unknown(Knownness):  # noqa: N801
    """Denoted by ``*``, analysis not available."""
    symbol = '*'


class biunknown(Knownness):  # noqa: N801
    """Denoted by ``@``, translation not available."""
    symbol = '@'


class genunknown(Knownness):  # noqa: N801
    """Denoted by ``#``, generated form not available."""
    symbol = '#'


def _symbol_to_knownness(symbol):
    # type: (str) -> Type[Knownness]
    return {'*': unknown, '@': biunknown, '#': genunknown}.get(symbol, known)


SReading = namedtuple('SReading', ['baseform', 'tags'])
SReading.__doc__ = """A single subreading of an analysis of a token.

Attributes:
    baseform (str): The base form (lemma, lexical form, citation form) of the reading.
    tags (List[str]): The morphological tags associated with the reading.
"""


def subreading_to_string(sub):
    # type: (SReading) -> str
    return sub.baseform + ''.join('<' + t + '>' for t in sub.tags)  # type: ignore


def reading_to_string(reading):
    # type: (List[SReading]) -> str
    return '+'.join(subreading_to_string(sub) for sub in reading)
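

# A quick illustration of the string helpers above (a hedged sketch: the
# readings are invented, but the outputs follow directly from the code):
#
#     >>> sub = SReading(baseform='dog', tags=['n', 'sg'])
#     >>> subreading_to_string(sub)
#     'dog<n><sg>'
#     >>> reading_to_string([sub, SReading(baseform='house', tags=['n'])])
#     'dog<n><sg>+house<n>'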


def mainpos(reading, ltr=False):
    # type: (List[SReading], bool) -> str
    """Return the first part-of-speech tag of a reading.

    If there are several subreadings, by default give the first tag of
    the last subreading. If ltr=True, give the first tag of the first
    subreading; see
    http://beta.visl.sdu.dk/cg3/single/#sub-stream-apertium for more
    information.
    """
    if ltr:
        return reading[0].tags[0]  # type: ignore
    else:
        return reading[-1].tags[0]  # type: ignore
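

# Example (hedged; the reading below is made up):
#
#     >>> reading = [SReading(baseform='take', tags=['vblex', 'pres']),
#     ...            SReading(baseform='out', tags=['adv'])]
#     >>> mainpos(reading)            # first tag of the *last* subreading
#     'adv'
#     >>> mainpos(reading, ltr=True)  # first tag of the *first* subreading
#     'vblex'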


def _parse_tags(tag_str):
    # type: (str) -> List[str]
    in_tag = False
    tags = []
    buf = ''
    stream = (c for c in tag_str)
    for c in stream:
        if not in_tag and c == '<':
            in_tag = True
            continue
        elif c == '\\':
            buf += c
            buf += next(stream)
        elif c == '>':
            tags.append(buf)
            buf = ''
            in_tag = False
        else:
            buf += c
    if buf != '':
        tags.append(buf)
    return tags


def _parse_subreading(reading):
    # type: (str) -> List[Tuple[str, str]]
    in_lemma = True
    lemma = ''
    subs = []
    buf = ''
    stream = (c for c in reading)
    for c in stream:
        if c == '+':
            subs.append((lemma, buf))
            buf = ''
            lemma = ''
            in_lemma = True
            continue
        elif c == '\\':
            buf += c
            buf += next(stream)
        elif in_lemma and c == '<':
            in_lemma = False
            lemma = buf
            buf = ''
            buf += c
        else:
            buf += c
    if buf != '':
        if in_lemma:
            subs.append((lemma + buf, ''))
        else:
            subs.append((lemma, buf))
    return subs
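

# Behaviour of the internal parsers on a typical analysis string (a hedged
# sketch with invented input; note that _parse_subreading keeps the leading
# '<' of each tag string, which _parse_tags then consumes):
#
#     >>> _parse_tags('<vblex><pres>')
#     ['vblex', 'pres']
#     >>> _parse_subreading('take<vblex>+out<adv>')
#     [('take', '<vblex>'), ('out', '<adv>')]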


class LexicalUnit:
    """A lexical unit consisting of a lemma and its readings.

    Attributes:
        lexical_unit (str): The lexical unit in Apertium stream format.
        wordform (str): The word form (surface form) of the lexical unit.
        wordbound_blank (str): The wordbound blank of the lexical unit.
        readings (List[List[:class:`SReading`]]): The analyses of the lexical unit with sublists containing all subreadings.
        knownness (:class:`Knownness`): The level of knowledge of the lexical unit.
    """

    def __init__(self, lexical_unit):
        # type: (str) -> None
        self.lexical_unit = lexical_unit

        cohort = re.split(r'(?<!\\)/', lexical_unit)

        if ']]^' in cohort[0]:
            self.wordbound_blank, self.wordform = cohort[0].split(']]^', 1)
            self.wordbound_blank += ']]'
        else:
            self.wordbound_blank = ''
            self.wordform = cohort[0]

        readings = cohort[1:]

        if len(readings) == 1:
            self.knownness = _symbol_to_knownness(readings[0][:1])
        else:
            self.knownness = known

        self.readings = []  # type: List[List[SReading]]
        for reading in readings:
            if len(reading) < 1:
                warnings.warn('Empty readings for {}'.format(self.lexical_unit), RuntimeWarning)
            else:
                subreadings = []
                for subreading in _parse_subreading(reading):
                    baseform = subreading[0].lstrip('+')
                    tags = _parse_tags(subreading[1])
                    subreadings.append(SReading(baseform=baseform, tags=tags))
                self.readings.append(subreadings)

    def __repr__(self):
        # type: () -> str
        return self.lexical_unit
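

# Constructing a LexicalUnit directly (hedged: the tokens below are invented
# Apertium stream-format examples, not project test data):
#
#     >>> lu = LexicalUnit('dogs/dog<n><pl>/dog<vblex><pres>')
#     >>> lu.wordform
#     'dogs'
#     >>> [reading_to_string(r) for r in lu.readings]
#     ['dog<n><pl>', 'dog<vblex><pres>']
#     >>> LexicalUnit('xyzzy/*xyzzy').knownness is unknown
#     True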


@functools.singledispatch
def parse(stream, with_text=False):
    # type: (Iterator[str], bool) -> Iterator[Union[Tuple[str, LexicalUnit], LexicalUnit]]
    """Generates lexical units from a character stream.

    Args:
        stream (Iterator[str]): A character stream containing lexical units,
            superblanks and other text.
        with_text (Optional[bool]): A boolean defining whether to output the
            preceding text with each lexical unit.

    Yields:
        :class:`LexicalUnit`: The next lexical unit found in the character
            stream (if ``with_text`` is False).

        *(str, LexicalUnit)*: A tuple of the text that separated the lexical
            unit from the prior one and the lexical unit itself (if
            ``with_text`` is True).
    """
    buffer = ''
    text_buffer = ''
    in_lexical_unit = False
    in_superblank = False

    for char in stream:
        if in_superblank:
            if char == ']':
                in_superblank = False
                text_buffer += char
            elif char == '\\':
                text_buffer += char
                text_buffer += next(stream)
            else:
                text_buffer += char
        elif in_lexical_unit:
            if char == '$':
                if with_text:
                    yield (text_buffer, LexicalUnit(buffer))
                else:
                    yield LexicalUnit(buffer)
                buffer = ''
                text_buffer = ''
                in_lexical_unit = False
            elif char == '\\':
                buffer += char
                buffer += next(stream)
            else:
                buffer += char
        else:
            if char == '[':
                next_char = next(stream)
                if next_char == '[':
                    buffer += '[['
                    in_lexical_unit = True
                else:
                    in_superblank = True
                    text_buffer += char
                    if next_char == ']':
                        in_superblank = False
                        text_buffer += next_char
                    elif next_char == '\\':
                        text_buffer += next_char
                        text_buffer += next(stream)
                    else:
                        text_buffer += next_char
            elif char == '^':
                in_lexical_unit = True
            elif char == '\\':
                text_buffer += char
                text_buffer += next(stream)
            else:
                text_buffer += char


@parse.register(str)
def _parse_str(str, **kwargs):
    # type: (str, dict) -> Iterator[Union[Tuple[str, LexicalUnit], LexicalUnit]]
    return parse(iter(str), **kwargs)
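

# Usage sketch for parse() (hedged: the stream is invented; plain strings
# dispatch through _parse_str above, so both forms below work):
#
#     >>> list(parse('[blank] ^dogs/dog<n><pl>$'))
#     [dogs/dog<n><pl>]
#     >>> text, lu = next(parse('[blank] ^dogs/dog<n><pl>$', with_text=True))
#     >>> text
#     '[blank] '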


def parse_file(f, **kwargs):
    # type: (Iterable, dict) -> Iterator[Union[Tuple[str, LexicalUnit], LexicalUnit]]
    """Generates lexical units from a file.

    Args:
        f (file): A file containing lexical units, superblanks and other text.

    Yields:
        :class:`LexicalUnit`: The next lexical unit found in the file.
    """
    return parse(itertools.chain.from_iterable(f), **kwargs)
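

# Sketch of file-based parsing (hedged: 'analysed.txt' is a hypothetical
# path; any iterable of strings works, since its elements are chained into
# one character stream):
#
#     >>> with open('analysed.txt') as f:  # doctest: +SKIP
#     ...     for lu in parse_file(f):
#     ...         print(lu.wordform, lu.knownness.symbol)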


def main():
    # type: () -> None
    lexical_units = parse_file(fileinput.input())
    for lexical_unit in lexical_units:
        pprint.pprint(lexical_unit.readings, width=120)  # type: ignore


if __name__ == '__main__':
    main()