# Copyright (c) 2006-2019  Andrey Golovigin
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

r"""(simple but) rich text formatting tools

Usage:

>>> t = Text('this ', 'is a ', Tag('em', 'very'), Text(' rich', ' text'))
>>> print(t.render_as('latex'))
this is a \emph{very} rich text
>>> print(six.text_type(t))
this is a very rich text
>>> t = t.capitalize().add_period()
>>> print(t.render_as('latex'))
This is a \emph{very} rich text.
>>> print(six.text_type(t))
This is a very rich text.
>>> print(Symbol('ndash').render_as('latex'))
--
>>> t = Text('Some ', Tag('em', Text('nested ', Tag('tt', 'Text', Text(' objects')))), '.')
>>> print(t.render_as('latex'))
Some \emph{nested \texttt{Text objects}}.
>>> print(six.text_type(t))
Some nested Text objects.
>>> t = t.upper()
>>> print(t.render_as('latex'))
SOME \emph{NESTED \texttt{TEXT OBJECTS}}.
>>> print(six.text_type(t))
SOME NESTED TEXT OBJECTS.

>>> t = Text(', ').join(['one', 'two', Tag('em', 'three')])
>>> print(t.render_as('latex'))
one, two, \emph{three}
>>> print(six.text_type(t))
one, two, three
>>> t = Text(Symbol('nbsp')).join(['one', 'two', Tag('em', 'three')])
>>> print(t.render_as('latex'))
one~two~\emph{three}
>>> print(six.text_type(t))
one<nbsp>two<nbsp>three
"""
from __future__ import absolute_import, unicode_literals

import itertools
import warnings
from abc import ABCMeta, abstractmethod

import six
from pybtex import textutils
from pybtex.utils import collect_iterable, deprecated
from pybtex import py3compat


# workaround for doctests in Python 2/3
def str_repr(string):
    """
    Return ``repr(string)`` without the leading ``u`` prefix, so that
    doctest output looks the same on Python 2 and Python 3.

    >>> print(str_repr('test'))
    'test'
    >>> print(str_repr(u'test'))
    'test'
    """
    result = repr(string)
    if result.startswith('u'):
        return result[1:]
    else:
        return result


def ensure_text(value):
    """
    Convert ``value`` into a rich text object.

    Plain strings are wrapped into :py:class:`String`;
    :py:class:`BaseText` instances are returned unchanged.

    :raises ValueError: if ``value`` is neither a string nor a
        :py:class:`BaseText` instance.
    """
    if isinstance(value, six.string_types):
        return String(value)
    elif isinstance(value, BaseText):
        return value
    else:
        bad_type = type(value).__name__
        raise ValueError('parts must be strings or BaseText instances, not ' + bad_type)


@py3compat.python_2_unicode_compatible
class BaseText(object):
    """Abstract interface shared by all rich text objects."""

    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2;
    # Python 3 ignores it, so abstract methods are not enforced there.
    __metaclass__ = ABCMeta

    @abstractmethod
    def __str__(self):
        raise NotImplementedError

    @abstractmethod
    def __eq__(self, other):
        raise NotImplementedError

    def __ne__(self, other):
        return not self == other

    @abstractmethod
    def __len__(self):
        raise NotImplementedError

    @abstractmethod
    def __contains__(self, item):
        raise NotImplementedError

    @abstractmethod
    def __getitem__(self, key):
        raise NotImplementedError

    def __add__(self, other):
        """
        Concatenate this Text with another Text or string.

        >>> Text('Longcat is ') + Tag('em', 'long')
        Text('Longcat is ', Tag('em', 'long'))
        """

        return Text(self, other)

    def append(self, text):
        """
        Append text to the end of this text.

        Normally, this is the same as concatenating texts with +,
        but for tags and similar objects the appended text is placed _inside_ the tag.

        >>> text = Tag('em', 'Look here')
        >>> print((text + '!').render_as('html'))
        <em>Look here</em>!
        >>> print(text.append('!').render_as('html'))
        <em>Look here!</em>
        """

        return self + text

    def join(self, parts):
        """Join a list using this text (like string.join)

        >>> letters = ['a', 'b', 'c']
        >>> print(six.text_type(String('-').join(letters)))
        a-b-c
        >>> print(six.text_type(String('-').join(iter(letters))))
        a-b-c
        """

        # Iterators are always truthy; an exhausted one simply leaves
        # ``joined`` empty below, which also produces an empty Text.
        if not parts:
            return Text()
        joined = []
        for part in parts:
            if joined:
                joined.append(self)
            joined.append(part)
        return Text(*joined)

    @abstractmethod
    def split(self, sep=None, keep_empty_parts=None):
        raise NotImplementedError

    @abstractmethod
    def startswith(self, prefix):
        """
        Return True if string starts with the prefix,
        otherwise return False.

        prefix can also be a tuple of prefixes to look for.
        """
        raise NotImplementedError

    @abstractmethod
    def endswith(self, suffix):
        """
        Return True if the string ends with the specified suffix,
        otherwise return False.

        suffix can also be a tuple of suffixes to look for.
        """
        raise NotImplementedError

    @abstractmethod
    def isalpha(self):
        raise NotImplementedError

    def add_period(self, period='.'):
        """
        Add a period to the end of text, if the last character is not ".", "!" or "?".

        >>> text = Text("That's all, folks")
        >>> print(six.text_type(text.add_period()))
        That's all, folks.

        >>> text = Text("That's all, folks!")
        >>> print(six.text_type(text.add_period()))
        That's all, folks!
        """

        if self and not textutils.is_terminated(self):
            return self.append(period)
        else:
            return self

    def abbreviate(self):
        """
        Abbreviate the text word by word.

        The text is split on ``textutils.delimiter_re``; every alphabetic
        part is replaced by its first character with a period added,
        other parts are kept unchanged.
        """
        def abbreviate_word(word):
            if word.isalpha():
                return word[0].add_period()
            else:
                return word
        parts = self.split(textutils.delimiter_re)
        return String('').join(abbreviate_word(part) for part in parts)

    def capfirst(self):
        """
        Capitalize the first letter of the text.

        >>> Text(Tag('em', 'long Cat')).capfirst()
        Text(Tag('em', 'Long Cat'))
        """
        return self[:1].upper() + self[1:]

    def capitalize(self):
        """
        Capitalize the first letter of the text and lowercase the rest.

        >>> Text(Tag('em', 'LONG CAT')).capitalize()
        Text(Tag('em', 'Long cat'))
        """
        return self[:1].upper() + self[1:].lower()

    @abstractmethod
    def lower(self):
        raise NotImplementedError

    @abstractmethod
    def upper(self):
        raise NotImplementedError

    @abstractmethod
    def render(self, backend):
        raise NotImplementedError

    def render_as(self, backend_name):
        r"""
        Render this :py:class:`Text` into markup.
        This is a wrapper method that loads a formatting backend plugin
        and calls :py:meth:`Text.render`.

        >>> text = Text('Longcat is ', Tag('em', 'looooooong'), '!')
        >>> print(text.render_as('html'))
        Longcat is <em>looooooong</em>!
        >>> print(text.render_as('latex'))
        Longcat is \emph{looooooong}!
        >>> print(text.render_as('text'))
        Longcat is looooooong!

        :param backend_name: The name of the output backend (like ``"latex"`` or
            ``"html"``).
        """
        # Imported lazily, at call time rather than at module import time.
        from pybtex.plugin import find_plugin
        backend_cls = find_plugin('pybtex.backends', backend_name)
        return self.render(backend_cls())

    def _unpack(self):
        """
        For Text object, iterate over all text parts.
        Else, yield the object itself.

        Used for unpacking Text objects passed as children to another Text object.
        """

        yield self

    def _typeinfo(self):
        """
        Return the type of this object and its parameters
        (not including the actual text content).

        Used for:

        - merging similar tags together (<em>A</em><em>B</em> -> <em>AB</em>),
        - creating similar text objects with different text content.
        """

        return None, ()


@py3compat.python_2_unicode_compatible
class BaseMultipartText(BaseText):
    """Base class for rich text objects made of a list of child parts."""

    # Extra constructor arguments (tag name, URL, ...) that precede the parts.
    info = ()

    def __init__(self, *parts):
        """Create a text object consisting of one or more parts.

        Empty parts are ignored:

        >>> Text() == Text('') == Text('', '', '')
        True
        >>> Text('Word', '') == Text('Word')
        True

        Text() objects are unpacked and their children are included directly:

        >>> Text(Text('Multi', ' '), Tag('em', 'part'), Text(' ', Text('text!')))
        Text('Multi ', Tag('em', 'part'), ' text!')
        >>> Tag('strong', Text('Multi', ' '), Tag('em', 'part'), Text(' ', 'text!'))
        Tag('strong', 'Multi ', Tag('em', 'part'), ' text!')

        Similar objects are merged together:

        >>> Text('Multi', Tag('em', 'part'), Text(Tag('em', ' ', 'text!')))
        Text('Multi', Tag('em', 'part text!'))
        >>> Text('Please ', HRef('/', 'click'), HRef('/', ' here'), '.')
        Text('Please ', HRef('/', 'click here'), '.')
        """

        parts = (ensure_text(part) for part in parts)
        nonempty_parts = (part for part in parts if part)
        unpacked_parts = itertools.chain.from_iterable(
            part._unpack() for part in nonempty_parts
        )
        merged_parts = self._merge_similar(unpacked_parts)
        self.parts = list(merged_parts)
        # Cached so that len() does not have to walk the parts every time.
        self.length = sum(len(part) for part in self.parts)

    def __str__(self):
        return ''.join(six.text_type(part) for part in self.parts)

    def __eq__(self, other):
        """
        Rich text objects support equality comparison:

        >>> Text('Cat') == Text('cat')
        False
        >>> Text('Cat') == Text('Cat')
        True
        """
        return (
            isinstance(other, BaseText) and
            self._typeinfo() == other._typeinfo() and
            self.parts == other.parts
        )

    def __len__(self):
        """
        ``len(text)`` returns the number of characters in the text, ignoring
        the markup:

        >>> len(Text('Long cat'))
        8
        >>> len(Text(Tag('em', 'Long'), ' cat'))
        8
        >>> len(Text(HRef('http://example.com/', 'Long'), ' cat'))
        8
        """
        return self.length

    def __contains__(self, item):
        """
        ``value in text`` returns ``True`` if any part of the ``text``
        contains the substring ``value``:

        >>> 'Long cat' in Text('Long cat!')
        True

        Substrings split across multiple text parts are not matched:

        >>> 'Long cat' in Text(Tag('em', 'Long'), 'cat!')
        False
        """
        if not isinstance(item, six.string_types):
            raise TypeError(item)
        # Like with plain strings, the empty string is contained in any text.
        return not item or any(part.__contains__(item) for part in self.parts)

    def __getitem__(self, key):
        """
        Slicing and extracting characters works like with regular strings,
        formatting is preserved.

        >>> Text('Longcat is ', Tag('em', 'looooooong!'))[:15]
        Text('Longcat is ', Tag('em', 'looo'))
        >>> Text('Longcat is ', Tag('em', 'looooooong!'))[-1]
        Text(Tag('em', '!'))

        Empty slices produce an empty text:

        >>> Text('Longcat')[5:2]
        Text()

        :raises NotImplementedError: for slices with a step other than 1.
        :raises TypeError: if ``key`` is neither an integer nor a slice.
        """

        if isinstance(key, six.integer_types):
            start = key
            end = None
        elif isinstance(key, slice):
            start, end, step = key.indices(len(self))
            if step != 1:
                raise NotImplementedError
        else:
            raise TypeError(key, type(key))

        if start < 0:
            start = len(self) + start
        if end is None:
            end = start + 1
        if end < 0:
            end = len(self) + end
        # Clamp the length: when end < start a negative length would make
        # _slice_beginning() cut characters off the *end* of the text
        # instead of returning an empty result.
        slice_length = max(end - start, 0)
        return self._slice_end(len(self) - start)._slice_beginning(slice_length)

    def _slice_beginning(self, slice_length):
        """
        Return a text consisting of the first slice_length characters
        of this text (with formatting preserved).
        """

        parts = []
        length = 0
        for part in self.parts:
            if length + len(part) > slice_length:
                # This part crosses the boundary: keep only its head and stop.
                parts.append(part[:slice_length - length])
                break
            else:
                parts.append(part)
                length += len(part)
        return self._create_similar(parts)

    def _slice_end(self, slice_length):
        """
        Return a text consisting of the last slice_length characters
        of this text (with formatting preserved).
        """

        parts = []
        length = 0
        for part in reversed(self.parts):
            if length + len(part) > slice_length:
                # This part crosses the boundary: keep only its tail and stop.
                parts.append(part[len(part) - (slice_length - length):])
                break
            else:
                parts.append(part)
                length += len(part)
        return self._create_similar(reversed(parts))

    def append(self, text):
        """
        Append text to the end of this text.

        For Tags, HRefs, etc. the appended text is placed *inside* the tag.

        >>> text = Tag('strong', 'Chuck Norris')
        >>> print((text + ' wins!').render_as('html'))
        <strong>Chuck Norris</strong> wins!
        >>> print(text.append(' wins!').render_as('html'))
        <strong>Chuck Norris wins!</strong>
        """

        return self._create_similar(self.parts + [text])

    @collect_iterable
    def split(self, sep=None, keep_empty_parts=None):
        """
        Split the text on ``sep``, preserving the formatting of the pieces.

        ``keep_empty_parts`` defaults to True when ``sep`` is given and
        to False otherwise.

        >>> Text('a + b').split()
        [Text('a'), Text('+'), Text('b')]

        >>> Text('a, b').split(', ')
        [Text('a'), Text('b')]
        """

        if keep_empty_parts is None:
            keep_empty_parts = sep is not None

        # ``tail`` accumulates the pieces of the not yet finished item,
        # which may continue in the following parts.
        tail = [''] if keep_empty_parts else []
        for part in self.parts:
            split_part = part.split(sep, keep_empty_parts=True)
            if not split_part:
                continue
            for item in split_part[:-1]:
                if tail:
                    yield self._create_similar(tail + [item])
                    tail = []
                else:
                    if item or keep_empty_parts:
                        yield self._create_similar([item])
            tail.append(split_part[-1])
        if tail:
            tail_text = self._create_similar(tail)
            if tail_text or keep_empty_parts:
                yield tail_text

    def startswith(self, prefix):
        """
        Return True if the text starts with the given prefix.

        >>> Text('Longcat!').startswith('Longcat')
        True

        Prefixes split across multiple parts are not matched:

        >>> Text(Tag('em', 'Long'), 'cat!').startswith('Longcat')
        False
        """

        if not self.parts:
            return False
        else:
            return self.parts[0].startswith(prefix)

    def endswith(self, suffix):
        """
        Return True if the text ends with the given suffix.

        >>> Text('Longcat!').endswith('cat!')
        True

        Suffixes split across multiple parts are not matched:

        >>> Text('Long', Tag('em', 'cat'), '!').endswith('cat!')
        False
        """

        if not self.parts:
            return False
        else:
            return self.parts[-1].endswith(suffix)

    def isalpha(self):
        """
        Return True if all characters in the string are alphabetic and there is
        at least one character, False otherwise.
        """
        return bool(self) and all(part.isalpha() for part in self.parts)

    def lower(self):
        """
        Convert rich text to lowercase.

        >>> Text(Tag('em', 'Long cat')).lower()
        Text(Tag('em', 'long cat'))
        """
        return self._create_similar(part.lower() for part in self.parts)

    def upper(self):
        """
        Convert rich text to uppercase.

        >>> Text(Tag('em', 'Long cat')).upper()
        Text(Tag('em', 'LONG CAT'))
        """
        return self._create_similar(part.upper() for part in self.parts)

    def render(self, backend):
        """
        Render this :py:class:`Text` into markup.

        :param backend: The formatting backend (an instance of
            :py:class:`pybtex.backends.BaseBackend`).
        """

        rendered_list = [part.render(backend) for part in self.parts]
        assert all(isinstance(item, backend.RenderType)
                   for item in rendered_list)
        return backend.render_sequence(rendered_list)

    def _typeinfo(self):
        """Return the type and the parameters used to create this text object.

        >>> text = Tag('strong', 'Heavy rain!')
        >>> text._typeinfo() == (Tag, ('strong',))
        True
        """

        return type(self), self.info

    def _create_similar(self, parts):
        """
        Create a new text object of the same type with the same parameters,
        with different text content.

        >>> text = Tag('strong', 'Bananas!')
        >>> text._create_similar(['Apples!'])
        Tag('strong', 'Apples!')
        """

        cls, cls_args = self._typeinfo()
        args = list(cls_args) + list(parts)
        return cls(*args)

    def _merge_similar(self, parts):
        """Merge adjacent text objects with the same type and parameters together.

        >>> text = Text()
        >>> parts = [Tag('em', 'Breaking'), Tag('em', ' '), Tag('em', 'news!')]
        >>> list(text._merge_similar(parts))
        [Tag('em', 'Breaking news!')]
        """

        groups = itertools.groupby(parts, lambda value: value._typeinfo())
        for typeinfo, group in groups:
            cls, info = typeinfo
            group = list(group)
            # Objects whose _typeinfo() has no class (cls is None) are
            # never merged.
            if cls and len(group) > 1:
                group_parts = itertools.chain.from_iterable(
                    text.parts for text in group
                )
                args = list(info) + list(group_parts)
                yield cls(*args)
            else:
                for text in group:
                    yield text

    @deprecated('0.19', 'use __unicode__() instead')
    def plaintext(self):
        return six.text_type(self)

    @deprecated('0.19')
    def enumerate(self):
        for n, child in enumerate(self.parts):
            try:
                for p in child.enumerate():
                    yield p
            except AttributeError:
                yield self, n

    @deprecated('0.19')
    def reversed(self):
        for n, child in reversed(list(enumerate(self.parts))):
            try:
                for p in child.reversed():
                    yield p
            except AttributeError:
                yield self, n

    @deprecated('0.19', 'use slicing instead')
    def get_beginning(self):
        try:
            l, i = next(self.enumerate())
        except StopIteration:
            pass
        else:
            return l.parts[i]

    @deprecated('0.19', 'use slicing instead')
    def get_end(self):
        try:
            l, i = next(self.reversed())
        except StopIteration:
            pass
        else:
            return l.parts[i]

    @deprecated('0.19', 'use slicing instead')
    def apply_to_start(self, f):
        return self.map(f, lambda index, length: index == 0)

    @deprecated('0.19', 'use slicing instead')
    def apply_to_end(self, f):
        return self.map(f, lambda index, length: index == length - 1)

    @deprecated('0.19')
    def map(self, f, condition=None):
        if condition is None:
            condition = lambda index, length: True

        def iter_map_with_condition():
            length = len(self)
            for index, child in enumerate(self.parts):
                if hasattr(child, 'map'):
                    yield child.map(f, condition) if condition(index, length) else child
                else:
                    yield f(child) if condition(index, length) else child
        return self._create_similar(iter_map_with_condition())


@py3compat.python_2_unicode_compatible
class String(BaseText):
    """
    A :py:class:`String` is a wrapper for a plain Python string.

    >>> from pybtex.richtext import String
    >>> print(String('Crime & Punishment').render_as('text'))
    Crime & Punishment
    >>> print(String('Crime & Punishment').render_as('html'))
    Crime &amp; Punishment

    :py:class:`String` supports the same methods as :py:class:`Text`.

    """

    def __init__(self, *parts):
        """
        All arguments must be plain unicode strings.
        Arguments are concatenated together.

        >>> print(six.text_type(String('November', ', ', 'December', '.')))
        November, December.
        """

        self.value = ''.join(parts)

    def __repr__(self):
        return str_repr(self.value)

    def __str__(self):
        return six.text_type(self.value)

    def __eq__(self, other):
        """
        Compare two :py:class:`.String` objects.
        """
        # Exact type match on purpose: a String never equals an instance
        # of a subclass or of any other text class.
        return type(other) is type(self) and self.value == other.value

    def __len__(self):
        return self.value.__len__()

    def __contains__(self, item):
        return self.value.__contains__(item)

    def __getitem__(self, index):
        return String(self.value.__getitem__(index))

    def __add__(self, other):
        return BaseText.__add__(self, other)

    def split(self, sep=None, keep_empty_parts=None):
        """
        Split the string and return a list of :py:class:`String` objects.

        :param sep: ``None`` (split on whitespace), a plain string, or an
            object with a ``split()`` method such as a compiled regular
            expression.
        :param keep_empty_parts: whether empty parts are kept in the result;
            defaults to True when ``sep`` is given and to False otherwise.
        :raises TypeError: if ``sep`` is of an unsupported type.
        """
        if keep_empty_parts is None:
            keep_empty_parts = sep is not None

        if sep is None:
            parts = textutils.whitespace_re.split(self.value)
        elif isinstance(sep, six.string_types):
            parts = self.value.split(sep)
        else:
            try:
                split_method = sep.split
            except AttributeError:
                raise TypeError('sep must be None, string or compiled regular expression')
            else:
                parts = split_method(self.value)
        return [String(part) for part in parts if part or keep_empty_parts]

    def startswith(self, prefix):
        """
        Return True if string starts with the prefix,
        otherwise return False.

        prefix can also be a tuple of prefixes to look for.
        """
        return self.value.startswith(prefix)

    def endswith(self, suffix):
        """
        Return True if the string ends with the specified suffix,
        otherwise return False.

        suffix can also be a tuple of suffixes to look for.
        """
        return self.value.endswith(suffix)

    def isalpha(self):
        return self.value.isalpha()

    def lower(self):
        return String(self.value.lower())

    def upper(self):
        return String(self.value.upper())

    @property
    def parts(self):
        # Lets adjacent String objects be merged through the same
        # ``.parts`` interface that multipart texts provide.
        return [six.text_type(self)]

    def _typeinfo(self):
        return String, ()

    def render(self, backend):
        return backend.format_str(self.value)


class Text(BaseMultipartText):
    """
    The :py:class:`Text` class is the top level container that may contain
    :py:class:`String`, :py:class:`Tag` or :py:class:`HRef` objects.

    """

    def __repr__(self):
        return 'Text({})'.format(', '.join(repr(part) for part in self.parts))

    def _unpack(self):
        # A Text nested inside another text dissolves into its children.
        for part in self.parts:
            yield part

    @classmethod
    def from_latex(cls, latex):
        """Parse LaTeX markup with ``pybtex.markup.LaTeXParser`` and return the result."""
        import codecs
        import latexcodec  # noqa
        from pybtex.markup import LaTeXParser
        return LaTeXParser(codecs.decode(latex, 'ulatex')).parse()


class Tag(BaseMultipartText):
    r"""
    A :py:class:`Tag` represents something like an HTML tag
    or a LaTeX formatting command:

    >>> from pybtex.richtext import Tag
    >>> tag = Tag('em', 'The TeXbook')
    >>> print(tag.render_as('html'))
    <em>The TeXbook</em>
    >>> print(tag.render_as('latex'))
    \emph{The TeXbook}

    :py:class:`Tag` supports the same methods as :py:class:`Text`.
    """

    def __check_name(self, name):
        """Map a deprecated tag name to its replacement, emitting a warning."""
        depr_map = {}
        depr_map[u'emph'] = u'em'
        if name in depr_map:
            msg = u"The tag '%s' is deprecated" % name
            msg += u", use '%s' instead." % depr_map[name]
            warnings.warn(msg, DeprecationWarning, stacklevel=3)
            return depr_map[name]
        return name

    def __init__(self, name, *args):
        if not isinstance(name, (six.string_types, Text)):
            raise ValueError(
                "name must be str or Text (got %s)" % name.__class__.__name__)
        self.name = self.__check_name(six.text_type(name))
        # One-element tuple: the constructor arguments preceding the parts.
        self.info = self.name,
        super(Tag, self).__init__(*args)

    def __repr__(self):
        if self.parts:
            reprparts = ', '.join(repr(part) for part in self.parts)
            return 'Tag({}, {})'.format(str_repr(self.name), reprparts)
        else:
            return 'Tag({})'.format(str_repr(self.name))

    def render(self, backend):
        text = super(Tag, self).render(backend)
        return backend.format_tag(self.name, text)


class HRef(BaseMultipartText):
    """
    A :py:class:`HRef` represents a hyperlink:

    >>> from pybtex.richtext import HRef
    >>> href = HRef('http://ctan.org/', 'CTAN')
    >>> print(href.render_as('html'))
    <a href="http://ctan.org/">CTAN</a>
    >>> print(href.render_as('latex'))
    \\href{http://ctan.org/}{CTAN}

    >>> href = HRef(String('http://ctan.org/'), String('http://ctan.org/'))
    >>> print(href.render_as('latex'))
    \\url{http://ctan.org/}

    :py:class:`HRef` supports the same methods as :py:class:`Text`.

    """

    def __init__(self, url, *args):
        if not isinstance(url, (six.string_types, BaseText)):
            raise ValueError(
                "url must be str or Text (got %s)" % url.__class__.__name__)
        self.url = six.text_type(url)
        # One-element tuple: the constructor arguments preceding the parts.
        self.info = self.url,
        super(HRef, self).__init__(*args)

    def __repr__(self):
        reprparts = ', '.join(repr(part) for part in self.parts)
        return 'HRef({}, {})'.format(str_repr(self.url), reprparts)

    def render(self, backend):
        text = super(HRef, self).render(backend)
        return backend.format_href(self.url, text)


class Protected(BaseMultipartText):
    r"""
    A :py:class:`Protected` represents a "protected" piece of text.

    - :py:meth:`Protected.lower`, :py:meth:`Protected.upper`,
      :py:meth:`Protected.capfirst`, and :py:meth:`Protected.capitalize`
      are no-ops and just return the :py:class:`Protected` object itself.
    - :py:meth:`Protected.split` never splits the text. It always returns a
      one-element list containing the :py:class:`Protected` object itself.
    - In LaTeX output, :py:class:`Protected` is {surrounded by braces}.
      The HTML backend wraps the text into a ``<span>`` element.

    >>> from pybtex.richtext import Protected
    >>> text = Protected('The CTAN archive')
    >>> text.lower()
    Protected('The CTAN archive')
    >>> text.split()
    [Protected('The CTAN archive')]
    >>> print(text.render_as('latex'))
    {The CTAN archive}
    >>> print(text.render_as('html'))
    <span class="bibtex-protected">The CTAN archive</span>

    .. versionadded:: 0.20

    """

    def __init__(self, *args):
        super(Protected, self).__init__(*args)

    def __repr__(self):
        reprparts = ', '.join(repr(part) for part in self.parts)
        return 'Protected({})'.format(reprparts)

    def capfirst(self):
        return self

    def capitalize(self):
        return self

    def lower(self):
        return self

    def upper(self):
        return self

    def split(self, sep=None, keep_empty_parts=None):
        return [self]

    def render(self, backend):
        text = super(Protected, self).render(backend)
        return backend.format_protected(text)


@py3compat.python_2_unicode_compatible
class Symbol(BaseText):
    """A special symbol. This class is rarely used and may be removed in
    future versions.

    Examples of special symbols are non-breaking spaces and dashes.

    :py:class:`Symbol` supports the same methods as :py:class:`Text`.
    """

    def __init__(self, name):
        self.name = name
        self.info = self.name,

    def __len__(self):
        return 1

    def __repr__(self):
        return "Symbol(%s)" % str_repr(self.name)

    def __str__(self):
        # XXX
        return u'<%s>' % self.name

    def __eq__(self, other):
        """
        Two symbols are equal if they have the same name.
        A symbol is never equal to an object of another type.
        """
        # The isinstance() guard matters: symbols get compared with strings
        # and tags whenever the ``parts`` lists of two texts are compared,
        # and those objects may not have the ``name`` attribute.
        return isinstance(other, Symbol) and self.name == other.name

    def __contains__(self, item):
        return False

    def __getitem__(self, index):
        # mimic the behavior of a 1-element string
        try:
            result = 'a'[index]
        except IndexError:
            raise IndexError('richtext.Symbol index out of range')
        else:
            return self if result else String()

    def split(self, sep=None, keep_empty_parts=None):
        return [self]

    def startswith(self, text):
        return False

    def endswith(self, text):
        return False

    def isalpha(self):
        return False

    def render(self, backend):
        return backend.symbols[self.name]

    def upper(self):
        return self

    def lower(self):
        return self


nbsp = Symbol('nbsp')