Reviewer notes (text before the first "diff --git" line is ignored by
`git apply` and `patch`):

* process_engine() used to assign its LogFileMessage to self.engine. That
  shadows the class-level `engine` pattern on the instance, so a second
  process() call on the same parser would reach self.engine.match() with a
  message object instead of a compiled pattern. The message is now stored in
  self.version, which __init__ already initialises to None.
* process_header() used to call next(lines) directly. process() documents
  `lines` as an iterable, and a plain list has no __next__.
* process_header() used to pass lines.encoding straight to codecs.lookup().
  io.StringIO is a TextIOBase whose encoding is None, which codecs.lookup()
  rejects with a TypeError.
* This patch had been flattened onto a few lines. Line breaks, indentation and
  blank context lines are reconstructed from the hunk line counts; in hunk
  "-74,15 +80,18" the position of one blank context line is a guess. The
  index line for texoutparse.py still carries the post-image hash of the
  unrevised patch. Verify against the target tree before applying.

diff --git a/README.md b/README.md
index 424afba..840caea 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,14 @@ the log file and collect the statistics.
 from texoutparse import LatexLogParser
 
 parser = LatexLogParser()
+
+# If using a unicode-supporting engine, e.g. XeTeX or LuaTeX/LuaHBTeX
 with open('sample.log') as f:
     parser.process(f)
+
+# If using an 8-bit engine, e.g. TeX or pdfTeX
+with open('sample.log', encoding='latin-1') as f:
+    parser.process(f)
 ```
 
 The `parser` object contains lists of errors, warnings, and bad boxes, each described by an `LogFileMessage` object. Both objects provide a `__str__` method that prints a summary of the
@@ -55,4 +61,4 @@ Sam Morley - [inakleinbottle.com](https://inakleinbottle.com) - admin@inakleinbo
 Distributed under the MIT license. See `LICENSE` for more information.
 
 ## Release History
- - 1.0. Initial release
\ No newline at end of file
+ - 1.0. Initial release
diff --git a/texoutparse.py b/texoutparse.py
index 48de104..8e46f12 100644
--- a/texoutparse.py
+++ b/texoutparse.py
@@ -1,10 +1,16 @@
 """
 Parser for LaTeX log files.
 """
+import io
 import re
+import codecs
+import warnings
 from collections import deque
 
 
+KNOWN_NONUNICODE_ENGINES = ['TeX', 'eTeX', 'pdfTeX']
+
+
 class LogFileMessage:
     """
     Helper class for storing log file messages.
@@ -74,15 +80,18 @@ class LatexLogParser:
     list.
     """
 
+    engine = re.compile(
+        r"^This is (\w+), Version ([\w\d.-]+)"
+    )
     error = re.compile(
-        r"^(?:! ((?:La|pdf)TeX|Package|Class)(?: (\w+))? [eE]rror(?: \(([\\]?\w+)\))?: (.*)|! (.*))"
+        r"^(?:! ((?:\w*)TeX|Package|Class)(?: (\w+))? [eE]rror(?: \(([\\]?\w+)\))?: (.*)|! (.*))"
     )
 
     warning = re.compile(
-        r"^((?:La|pdf)TeX|Package|Class)(?: (\w+))? [wW]arning(?: \(([\\]?\w+)\))?: (.*)"
+        r"^((?:\w*)TeX|Package|Class)(?: (\w+))? [wW]arning(?: \(([\\]?\w+)\))?: (.*)"
     )
     info = re.compile(
-        r"^((?:La|pdf)TeX|Package|Class)(?: (\w+))? [iI]nfo(?: \(([\\]?\w+)\))?: (.*)"
+        r"^((?:\w*)TeX|Package|Class)(?: (\w+))? [iI]nfo(?: \(([\\]?\w+)\))?: (.*)"
     )
     badbox = re.compile(
         r"^(Over|Under)full "
@@ -101,6 +110,7 @@ def __init__(self, context_lines=2):
         self.errors = []
         self.badboxes = []
         self.missing_refs = []
+        self.version = None
         self.context_lines = context_lines
 
     def __str__(self):
@@ -117,6 +127,8 @@ def process(self, lines):
 
         :param lines: Iterable over lines of log.
         """
+        self.process_header(lines)
+
         lines_iterable = _LineIterWrapper(lines, self.context_lines)
 
         # cache the line processor for speed
@@ -207,7 +219,7 @@ def process_warning(self, match):
 
         # Regex match groups
         # 0 - Whole match (line)
-        # 1 - Type ((?:La|pdf)TeX|Package|Class)
+        # 1 - Type ((?:\w*)TeX|Package|Class)
         # 2 - Package or Class name (\w*)
         # 3 - extra
         # 4 - Warning message (.*)
@@ -243,7 +255,7 @@ def process_error(self, match):
 
         # Regex match groups
         # 0 - Whole match (line)
-        # 1 - Type (LaTeX|Package|Class)
+        # 1 - Type ((?:\w*)TeX|Package|Class)
         # 2 - Package or Class (\w+)
         # 3 - extra (\(([\\]\w+)\))
         # 4 - Error message for typed error (.*)
@@ -288,7 +300,68 @@ def process_missing_ref(self, match):
         self.missing_refs.append(message)
         return message
 
+    def process_engine(self, match):
+        """
+        Record the engine name and version found in the log header.
+
+        The result is stored on ``self.version``. It must not be assigned to
+        ``self.engine``: that would shadow the class-level ``engine`` pattern
+        and break ``process_header`` on the next call to ``process``.
+
+        :param match: Match object produced by the ``engine`` pattern.
+        :return: ``LogFileMessage`` with ``'engine'`` and ``'version'`` keys.
+        """
+        message = LogFileMessage()
+        message['engine'] = match.group(1)
+        message['version'] = match.group(2)
+        self.version = message
+        return message
 
 
-
-
+    def process_header(self, lines):
+        """
+        Read the engine name and version from the first line of the log.
+
+        The first line of output should contain information about the engine,
+        e.g. LuaHBTeX, Version 1.13.0, among other information. We attempt to
+        read it and return quietly if we can't, since it is not crucial for
+        the subsequent work of the parser.
+
+        :param lines: Iterable over lines of log. A one-shot iterator, such as
+            an open file, has its first line consumed here.
+        """
+        # ``process`` documents ``lines`` as an iterable, so go through iter()
+        # instead of calling next() on it directly: a list has no __next__.
+        first_line = next(iter(lines), None)
+        if first_line is None:
+            return
+
+        engine_match = self.engine.match(first_line)
+        if engine_match is None:
+            return
+
+        self.process_engine(engine_match)
+        engine_name = engine_match.group(1)
+        if engine_name not in KNOWN_NONUNICODE_ENGINES:
+            return
+
+        # Only text streams carry an encoding, and in-memory ones such as
+        # io.StringIO report None, which codecs.lookup() rejects.
+        encoding = getattr(lines, 'encoding', None)
+        if not isinstance(lines, io.TextIOBase) or encoding is None:
+            return
+
+        try:
+            reads_utf8 = codecs.lookup(encoding).name == 'utf-8'
+        except LookupError:
+            return
+
+        if reads_utf8:
+            warnings.warn(
+                ' '.join((
+                    f'You are attempting to read unicode output from the non-unicode engine {engine_name}.',
+                    'This will likely result in a UnicodeDecodeError.',
+                    "Consider changing the encoding to 'latin-1' when reading the file."
+                )),
+                UnicodeWarning
+            )