You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
458 lines
16 KiB
458 lines
16 KiB
8 months ago
|
"""
|
||
|
LaTeX Codec
|
||
|
~~~~~~~~~~~
|
||
|
|
||
|
The :mod:`latexcodec.codec` module
|
||
|
contains all classes and functions for LaTeX code
|
||
|
translation. For practical use,
|
||
|
you should only ever need to import the :mod:`latexcodec` module,
|
||
|
which will automatically register the codec
|
||
|
so it can be used by :meth:`str.encode`, :meth:`str.decode`,
|
||
|
and any of the functions defined in the :mod:`codecs` module
|
||
|
such as :func:`codecs.open` and so on.
|
||
|
The other functions and classes
|
||
|
are exposed in case someone would want to extend them.
|
||
|
|
||
|
.. autofunction:: register
|
||
|
|
||
|
.. autofunction:: find_latex
|
||
|
|
||
|
.. autoclass:: LatexIncrementalEncoder
|
||
|
:show-inheritance:
|
||
|
:members:
|
||
|
|
||
|
.. autoclass:: LatexIncrementalDecoder
|
||
|
:show-inheritance:
|
||
|
:members:
|
||
|
|
||
|
.. autoclass:: LatexCodec
|
||
|
:show-inheritance:
|
||
|
:members:
|
||
|
|
||
|
.. autoclass:: LatexUnicodeTable
|
||
|
:members:
|
||
|
"""
|
||
|
|
||
|
# Copyright (c) 2003, 2008 David Eppstein
|
||
|
# Copyright (c) 2011-2020 Matthias C. M. Troffaes
|
||
|
#
|
||
|
# Permission is hereby granted, free of charge, to any person
|
||
|
# obtaining a copy of this software and associated documentation
|
||
|
# files (the "Software"), to deal in the Software without
|
||
|
# restriction, including without limitation the rights to use,
|
||
|
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
|
# copies of the Software, and to permit persons to whom the
|
||
|
# Software is furnished to do so, subject to the following
|
||
|
# conditions:
|
||
|
#
|
||
|
# The above copyright notice and this permission notice shall be
|
||
|
# included in all copies or substantial portions of the Software.
|
||
|
#
|
||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||
|
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
|
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||
|
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||
|
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||
|
# OTHER DEALINGS IN THE SOFTWARE.
|
||
|
|
||
|
import codecs
|
||
|
import dataclasses
|
||
|
import unicodedata
|
||
|
from typing import Optional, List, Union, Any, Iterator, Tuple, Type, Dict
|
||
|
|
||
|
try:
|
||
|
import importlib.resources as pkg_resources
|
||
|
except ImportError:
|
||
|
import importlib_resources as pkg_resources
|
||
|
|
||
|
|
||
|
from latexcodec import lexer
|
||
|
from codecs import CodecInfo
|
||
|
|
||
|
|
||
|
def register():
|
||
|
"""Register the :func:`find_latex` codec search function.
|
||
|
|
||
|
.. seealso:: :func:`codecs.register`
|
||
|
"""
|
||
|
codecs.register(find_latex)
|
||
|
|
||
|
# returns the codec search function
|
||
|
# this is used if latex_codec.py were to be placed in stdlib
|
||
|
|
||
|
|
||
|
def getregentry() -> Optional[CodecInfo]:
|
||
|
"""Encodings module API."""
|
||
|
return find_latex('latex')
|
||
|
|
||
|
|
||
|
@dataclasses.dataclass
|
||
|
class UnicodeLatexTranslation:
|
||
|
unicode: str
|
||
|
latex: str
|
||
|
encode: bool #: Suitable for unicode -> latex.
|
||
|
decode: bool #: Suitable for latex -> unicode.
|
||
|
text_mode: bool #: Latex works in text mode.
|
||
|
math_mode: bool #: Latex works in math mode.
|
||
|
|
||
|
|
||
|
def load_unicode_latex_table() -> Iterator[UnicodeLatexTranslation]:
|
||
|
with pkg_resources.open_text('latexcodec', 'table.txt') as datafile:
|
||
|
for line in datafile:
|
||
|
marker, unicode_names, latex = line.rstrip('\r\n').split('\u0009')
|
||
|
unicode = ''.join(
|
||
|
unicodedata.lookup(name) for name in unicode_names.split(','))
|
||
|
yield UnicodeLatexTranslation(
|
||
|
unicode=unicode,
|
||
|
latex=latex,
|
||
|
encode=marker[1] in {'-', '>'},
|
||
|
decode=marker[1] in {'-', '<'},
|
||
|
text_mode=marker[0] in {'A', 'T'},
|
||
|
math_mode=marker[0] in {'A', 'M'},
|
||
|
)
|
||
|
|
||
|
|
||
|
class LatexUnicodeTable:
|
||
|
"""Tabulates a translation between LaTeX and unicode."""
|
||
|
|
||
|
def __init__(self, lexer_):
|
||
|
self.lexer: lexer.LatexIncrementalLexer = lexer_
|
||
|
self.unicode_map: Dict[Tuple[lexer.Token, ...], str] = {}
|
||
|
self.max_length: int = 0
|
||
|
self.latex_map: Dict[
|
||
|
str, Tuple[str, Tuple[lexer.Token, ...]]] = {}
|
||
|
self.register_all()
|
||
|
|
||
|
def register_all(self):
|
||
|
"""Register all symbols and their LaTeX equivalents
|
||
|
(called by constructor).
|
||
|
"""
|
||
|
# register special symbols
|
||
|
self.register(UnicodeLatexTranslation(
|
||
|
unicode='\n\n',
|
||
|
latex=' \\par',
|
||
|
encode=False,
|
||
|
decode=True,
|
||
|
text_mode=True,
|
||
|
math_mode=False,
|
||
|
))
|
||
|
self.register(UnicodeLatexTranslation(
|
||
|
unicode='\n\n',
|
||
|
latex='\\par',
|
||
|
encode=False,
|
||
|
decode=True,
|
||
|
text_mode=True,
|
||
|
math_mode=False,
|
||
|
))
|
||
|
for trans in load_unicode_latex_table():
|
||
|
self.register(trans)
|
||
|
|
||
|
def register(self, trans: UnicodeLatexTranslation):
|
||
|
"""Register a correspondence between *unicode_text* and *latex_text*.
|
||
|
|
||
|
:param UnicodeLatexTranslation trans: Description of translation.
|
||
|
"""
|
||
|
if trans.math_mode and not trans.text_mode:
|
||
|
# also register text version
|
||
|
self.register(UnicodeLatexTranslation(
|
||
|
unicode=trans.unicode,
|
||
|
latex='$' + trans.latex + '$',
|
||
|
text_mode=True,
|
||
|
math_mode=False,
|
||
|
decode=trans.decode,
|
||
|
encode=trans.encode,
|
||
|
))
|
||
|
self.register(UnicodeLatexTranslation(
|
||
|
unicode=trans.unicode,
|
||
|
latex=r'\(' + trans.latex + r'\)',
|
||
|
text_mode=True,
|
||
|
math_mode=False,
|
||
|
decode=trans.decode,
|
||
|
encode=trans.encode,
|
||
|
))
|
||
|
# for the time being, we do not perform in-math substitutions
|
||
|
return
|
||
|
# tokenize, and register unicode translation
|
||
|
self.lexer.reset()
|
||
|
self.lexer.state = 'M'
|
||
|
tokens = tuple(self.lexer.get_tokens(trans.latex, final=True))
|
||
|
if trans.decode:
|
||
|
if tokens not in self.unicode_map:
|
||
|
self.max_length = max(self.max_length, len(tokens))
|
||
|
self.unicode_map[tokens] = trans.unicode
|
||
|
# also register token variant with brackets, if appropriate
|
||
|
# for instance, "\'{e}" for "\'e", "\c{c}" for "\c c", etc.
|
||
|
# note: we do not remove brackets (they sometimes matter,
|
||
|
# e.g. bibtex uses them to prevent lower case transformation)
|
||
|
if (len(tokens) == 2 and
|
||
|
tokens[0].name.startswith('control') and
|
||
|
tokens[1].name == 'chars'):
|
||
|
self.register(UnicodeLatexTranslation(
|
||
|
unicode=f"{{{trans.unicode}}}",
|
||
|
latex=f"{tokens[0].text}{{{tokens[1].text}}}",
|
||
|
decode=True, encode=False,
|
||
|
math_mode=trans.math_mode,
|
||
|
text_mode=trans.text_mode,
|
||
|
))
|
||
|
if (len(tokens) == 4 and
|
||
|
tokens[0].text in {'$', r'\('} and
|
||
|
tokens[1].name.startswith('control') and
|
||
|
tokens[2].name == 'chars' and
|
||
|
tokens[3].text in {'$', r'\)'}):
|
||
|
# drop brackets in this case, since it is math mode
|
||
|
self.register(UnicodeLatexTranslation(
|
||
|
unicode=f"{trans.unicode}",
|
||
|
latex=f"{tokens[0].text}{tokens[1].text}"
|
||
|
f"{{{tokens[2].text}}}{tokens[3].text}",
|
||
|
decode=True, encode=False,
|
||
|
math_mode=trans.math_mode,
|
||
|
text_mode=trans.text_mode,
|
||
|
))
|
||
|
if trans.encode and trans.unicode not in self.latex_map:
|
||
|
assert len(trans.unicode) == 1
|
||
|
self.latex_map[trans.unicode] = (trans.latex, tokens)
|
||
|
|
||
|
|
||
|
_LATEX_UNICODE_TABLE = LatexUnicodeTable(lexer.LatexIncrementalDecoder())
|
||
|
|
||
|
# incremental encoder does not need a buffer
|
||
|
# but decoder does
|
||
|
|
||
|
|
||
|
class LatexIncrementalEncoder(lexer.LatexIncrementalEncoder):
|
||
|
|
||
|
"""Translating incremental encoder for latex. Maintains a state to
|
||
|
determine whether control spaces etc. need to be inserted.
|
||
|
"""
|
||
|
|
||
|
emptytoken = lexer.Token("unknown", "") #: The empty token.
|
||
|
table = _LATEX_UNICODE_TABLE #: Translation table.
|
||
|
state: str
|
||
|
|
||
|
def __init__(self, errors='strict'):
|
||
|
super().__init__(errors=errors)
|
||
|
self.reset()
|
||
|
|
||
|
def reset(self):
|
||
|
super(LatexIncrementalEncoder, self).reset()
|
||
|
self.state = 'M'
|
||
|
|
||
|
def get_space_bytes(self, bytes_: str) -> Tuple[str, str]:
|
||
|
"""Inserts space bytes in space eating mode."""
|
||
|
if self.state == 'S':
|
||
|
# in space eating mode
|
||
|
# control space needed?
|
||
|
if bytes_.startswith(' '):
|
||
|
# replace by control space
|
||
|
return '\\ ', bytes_[1:]
|
||
|
else:
|
||
|
# insert space (it is eaten, but needed for separation)
|
||
|
return ' ', bytes_
|
||
|
else:
|
||
|
return '', bytes_
|
||
|
|
||
|
def _get_latex_chars_tokens_from_char(
|
||
|
self, c: str) -> Tuple[str, Tuple[lexer.Token, ...]]:
|
||
|
# if ascii, try latex equivalents
|
||
|
# (this covers \, #, &, and other special LaTeX characters)
|
||
|
if ord(c) < 128:
|
||
|
try:
|
||
|
return self.table.latex_map[c]
|
||
|
except KeyError:
|
||
|
pass
|
||
|
# next, try input encoding
|
||
|
try:
|
||
|
c.encode(self.inputenc, 'strict')
|
||
|
except UnicodeEncodeError:
|
||
|
pass
|
||
|
else:
|
||
|
return c, (lexer.Token(name='chars', text=c),)
|
||
|
# next, try latex equivalents of common unicode characters
|
||
|
try:
|
||
|
return self.table.latex_map[c]
|
||
|
except KeyError:
|
||
|
# translation failed
|
||
|
if self.errors == 'strict':
|
||
|
raise UnicodeEncodeError(
|
||
|
"latex", # codec
|
||
|
c, # problematic input
|
||
|
0, 1, # location of problematic character
|
||
|
"don't know how to translate {0} into latex"
|
||
|
.format(repr(c)))
|
||
|
elif self.errors == 'ignore':
|
||
|
return '', (self.emptytoken,)
|
||
|
elif self.errors == 'replace':
|
||
|
# use the \\char command
|
||
|
# this assumes
|
||
|
# \usepackage[T1]{fontenc}
|
||
|
# \usepackage[utf8]{inputenc}
|
||
|
bytes_ = '{\\char' + str(ord(c)) + '}'
|
||
|
return bytes_, (lexer.Token(name='chars', text=bytes_),)
|
||
|
elif self.errors == 'keep':
|
||
|
return c, (lexer.Token(name='chars', text=c),)
|
||
|
else:
|
||
|
raise ValueError(
|
||
|
"latex codec does not support {0} errors"
|
||
|
.format(self.errors))
|
||
|
|
||
|
def get_latex_chars(
|
||
|
self, unicode_: str, final: bool = False) -> Iterator[str]:
|
||
|
if not isinstance(unicode_, str):
|
||
|
raise TypeError(
|
||
|
"expected unicode for encode input, but got {0} instead"
|
||
|
.format(unicode_.__class__.__name__))
|
||
|
# convert character by character
|
||
|
for pos, c in enumerate(unicode_):
|
||
|
bytes_, tokens = self._get_latex_chars_tokens_from_char(c)
|
||
|
space, bytes_ = self.get_space_bytes(bytes_)
|
||
|
# update state
|
||
|
if tokens and tokens[-1].name == 'control_word':
|
||
|
# we're eating spaces
|
||
|
self.state = 'S'
|
||
|
elif tokens:
|
||
|
self.state = 'M'
|
||
|
if space:
|
||
|
yield space
|
||
|
yield bytes_
|
||
|
|
||
|
|
||
|
class LatexIncrementalDecoder(lexer.LatexIncrementalDecoder):
|
||
|
|
||
|
"""Translating incremental decoder for LaTeX."""
|
||
|
|
||
|
table = _LATEX_UNICODE_TABLE #: Translation table.
|
||
|
token_buffer: List[lexer.Token] #: The token buffer of this decoder.
|
||
|
|
||
|
def __init__(self, errors='strict'):
|
||
|
lexer.LatexIncrementalDecoder.__init__(self, errors=errors)
|
||
|
|
||
|
def reset(self):
|
||
|
lexer.LatexIncrementalDecoder.reset(self)
|
||
|
self.token_buffer = []
|
||
|
|
||
|
# python codecs API does not support multibuffer incremental decoders
|
||
|
|
||
|
def getstate(self) -> Any:
|
||
|
raise NotImplementedError
|
||
|
|
||
|
def setstate(self, state: Any) -> None:
|
||
|
raise NotImplementedError
|
||
|
|
||
|
def get_unicode_tokens(self, chars: str, final: bool = False
|
||
|
) -> Iterator[str]:
|
||
|
for token in self.get_tokens(chars, final=final):
|
||
|
# at this point, token_buffer does not match anything
|
||
|
self.token_buffer.append(token)
|
||
|
# new token appended at the end, see if we have a match now
|
||
|
# note: match is only possible at the *end* of the buffer
|
||
|
# because all other positions have already been checked in
|
||
|
# earlier iterations
|
||
|
for i in range(len(self.token_buffer), 0, -1):
|
||
|
last_tokens = tuple(self.token_buffer[-i:]) # last i tokens
|
||
|
try:
|
||
|
unicode_text = self.table.unicode_map[last_tokens]
|
||
|
except KeyError:
|
||
|
# no match: continue
|
||
|
continue
|
||
|
else:
|
||
|
# match!! flush buffer, and translate last bit
|
||
|
# exclude last i tokens
|
||
|
for token2 in self.token_buffer[:-i]:
|
||
|
yield self.decode_token(token2)
|
||
|
yield unicode_text
|
||
|
self.token_buffer = []
|
||
|
break
|
||
|
# flush tokens that can no longer match
|
||
|
while len(self.token_buffer) >= self.table.max_length:
|
||
|
yield self.decode_token(self.token_buffer.pop(0))
|
||
|
# also flush the buffer at the end
|
||
|
if final:
|
||
|
for token in self.token_buffer:
|
||
|
yield self.decode_token(token)
|
||
|
self.token_buffer = []
|
||
|
|
||
|
|
||
|
class LatexCodec(codecs.Codec):
|
||
|
IncrementalEncoder: Type[LatexIncrementalEncoder]
|
||
|
IncrementalDecoder: Type[LatexIncrementalDecoder]
|
||
|
|
||
|
def encode(self, unicode_: str, errors='strict' # type: ignore
|
||
|
) -> Tuple[Union[bytes, str], int]:
|
||
|
"""Convert unicode string to LaTeX bytes."""
|
||
|
encoder = self.IncrementalEncoder(errors=errors)
|
||
|
return encoder.encode(unicode_, final=True), len(unicode_)
|
||
|
|
||
|
def decode(self, bytes_: Union[bytes, str], errors='strict'
|
||
|
) -> Tuple[str, int]:
|
||
|
"""Convert LaTeX bytes to unicode string."""
|
||
|
decoder = self.IncrementalDecoder(errors=errors)
|
||
|
return decoder.decode(bytes_, final=True), len(bytes_) # type: ignore
|
||
|
|
||
|
|
||
|
class UnicodeLatexIncrementalDecoder(LatexIncrementalDecoder):
|
||
|
|
||
|
def decode(self, bytes_: str, final: bool = False) -> str: # type: ignore
|
||
|
return self.udecode(bytes_, final)
|
||
|
|
||
|
|
||
|
class UnicodeLatexIncrementalEncoder(LatexIncrementalEncoder):
|
||
|
|
||
|
def encode(self, unicode_: str, final: bool = False # type: ignore
|
||
|
) -> str:
|
||
|
return self.uencode(unicode_, final)
|
||
|
|
||
|
|
||
|
def find_latex(encoding: str) -> Optional[CodecInfo]:
|
||
|
"""Return a :class:`codecs.CodecInfo` instance for the requested
|
||
|
LaTeX *encoding*, which must be equal to ``latex``,
|
||
|
or to ``latex+<encoding>``
|
||
|
where ``<encoding>`` describes another encoding.
|
||
|
"""
|
||
|
IncEnc: Type[LatexIncrementalEncoder]
|
||
|
IncDec: Type[LatexIncrementalDecoder]
|
||
|
if '_' in encoding:
|
||
|
# Python 3.9 now normalizes "latex+latin1" to "latex_latin1"
|
||
|
# https://bugs.python.org/issue37751
|
||
|
encoding, _, inputenc_ = encoding.partition("_")
|
||
|
else:
|
||
|
encoding, _, inputenc_ = encoding.partition("+")
|
||
|
if not inputenc_:
|
||
|
inputenc_ = "ascii"
|
||
|
if encoding == "latex":
|
||
|
incremental_encoder = type(
|
||
|
"incremental_encoder", (LatexIncrementalEncoder,),
|
||
|
dict(inputenc=inputenc_))
|
||
|
incremental_decoder = type(
|
||
|
"incremental_encoder", (LatexIncrementalDecoder,),
|
||
|
dict(inputenc=inputenc_))
|
||
|
elif encoding == "ulatex":
|
||
|
incremental_encoder = type(
|
||
|
"incremental_encoder", (UnicodeLatexIncrementalEncoder,),
|
||
|
dict(inputenc=inputenc_))
|
||
|
incremental_decoder = type(
|
||
|
"incremental_encoder", (UnicodeLatexIncrementalDecoder,),
|
||
|
dict(inputenc=inputenc_))
|
||
|
else:
|
||
|
return None
|
||
|
|
||
|
class Codec(LatexCodec):
|
||
|
IncrementalEncoder = incremental_encoder
|
||
|
IncrementalDecoder = incremental_decoder
|
||
|
|
||
|
class StreamWriter(Codec, codecs.StreamWriter):
|
||
|
pass
|
||
|
|
||
|
class StreamReader(Codec, codecs.StreamReader):
|
||
|
pass
|
||
|
|
||
|
return codecs.CodecInfo(
|
||
|
encode=Codec().encode, # type: ignore
|
||
|
decode=Codec().decode, # type: ignore
|
||
|
incrementalencoder=Codec.IncrementalEncoder,
|
||
|
incrementaldecoder=Codec.IncrementalDecoder,
|
||
|
streamreader=StreamReader,
|
||
|
streamwriter=StreamWriter,
|
||
|
)
|