You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
412 lines
14 KiB
412 lines
14 KiB
# Copyright (c) 2006-2019 Andrey Golovigin
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining
|
|
# a copy of this software and associated documentation files (the
|
|
# "Software"), to deal in the Software without restriction, including
|
|
# without limitation the rights to use, copy, modify, merge, publish,
|
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
|
# permit persons to whom the Software is furnished to do so, subject to
|
|
# the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be
|
|
# included in all copies or substantial portions of the Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
"""BibTeX parser
|
|
|
|
>>> parser = Parser()
|
|
>>> bib_data = parser.parse_string(u'''
|
|
... @String{SCI = "Science"}
|
|
...
|
|
... @String{JFernandez = "Fernandez, Julio M."}
|
|
... @String{HGaub = "Gaub, Hermann E."}
|
|
... @String{MGautel = "Gautel, Mathias"}
|
|
... @String{FOesterhelt = "Oesterhelt, Filipp"}
|
|
... @String{MRief = "Rief, Matthias"}
|
|
...
|
|
... @Article{rief97b,
|
|
... author = MRief #" and "# MGautel #" and "# FOesterhelt
|
|
... #" and "# JFernandez #" and "# HGaub,
|
|
... title = "Reversible Unfolding of Individual Titin
|
|
... Immunoglobulin Domains by {AFM}",
|
|
... journal = SCI,
|
|
... volume = 276,
|
|
... number = 5315,
|
|
... pages = "1109--1112",
|
|
... year = 1997,
|
|
... doi = "10.1126/science.276.5315.1109",
|
|
... URL = "http://www.sciencemag.org/cgi/content/abstract/276/5315/1109",
|
|
... eprint = "http://www.sciencemag.org/cgi/reprint/276/5315/1109.pdf",
|
|
... }
|
|
... ''')
|
|
|
|
# entry keys are case-insensitive
|
|
>>> bib_data.entries['rief97b'] == bib_data.entries['RIEF97B']
|
|
True
|
|
|
|
>>> rief97b = bib_data.entries['rief97b']
|
|
>>> authors = rief97b.persons['author']
|
|
>>> for author in authors:
|
|
... print(six.text_type(author))
|
|
Rief, Matthias
|
|
Gautel, Mathias
|
|
Oesterhelt, Filipp
|
|
Fernandez, Julio M.
|
|
Gaub, Hermann E.
|
|
|
|
# field names are case-insensitive
|
|
>>> print(rief97b.fields['URL'])
|
|
http://www.sciencemag.org/cgi/content/abstract/276/5315/1109
|
|
>>> print(rief97b.fields['url'])
|
|
http://www.sciencemag.org/cgi/content/abstract/276/5315/1109
|
|
|
|
"""
|
|
from __future__ import unicode_literals
|
|
|
|
import re
|
|
from string import ascii_letters, digits
|
|
|
|
import six
|
|
|
|
from pybtex import textutils
|
|
from pybtex.bibtex.utils import split_name_list
|
|
from pybtex.database import Entry, Person, BibliographyDataError
|
|
from pybtex.database.input import BaseParser
|
|
from pybtex.scanner import (
|
|
Literal, Pattern, PrematureEOF, PybtexSyntaxError, Scanner
|
|
)
|
|
from pybtex.utils import CaseInsensitiveDict, CaseInsensitiveSet
|
|
|
|
# Predefined macros: the three-letter month abbreviations and the full
# month names they stand for.  Used as the default macro table by the
# parsers defined in this module.
month_names = {
    'jan': 'January',
    'feb': 'February',
    'mar': 'March',
    'apr': 'April',
    'may': 'May',
    'jun': 'June',
    'jul': 'July',
    'aug': 'August',
    'sep': 'September',
    'oct': 'October',
    'nov': 'November',
    'dec': 'December',
}


# Characters allowed at the start of a name (command, field or macro name).
# Digits are additionally allowed after the first character; see
# LowLevelParser.NAME.
NAME_CHARS = ascii_letters + u'@!$&*+-./:;<>?[\\]^_`|~\x7f'
|
|
|
|
|
|
class SkipEntry(Exception):
    """Internal signal: abandon the command being parsed without yielding it.

    Raised for ``@comment`` commands and for entries rejected by
    ``want_entry``; caught in ``LowLevelParser.parse_bibliography``.
    """
|
|
|
|
|
|
class UndefinedMacro(PybtexSyntaxError):
    """Syntax error reported when a field value refers to an unknown macro."""

    error_type = 'undefined string'
|
|
|
|
|
|
class DuplicateField(BibliographyDataError):
    """Data error reported when one entry defines the same field twice."""

    def __init__(self, entry_key, field_name):
        """Build the error message from the entry key and the repeated field name."""
        super(DuplicateField, self).__init__(
            'entry with key {} has a duplicate {} field'.format(entry_key, field_name)
        )
|
|
|
|
|
|
class LowLevelParser(Scanner):
    """Low-level BibTeX parser: scan the text and yield its ``@`` commands.

    Iterating over an instance yields one 2-tuple per successfully parsed
    command, ``(command_name, payload)``, where the payload is

    * ``(macro_name, value_parts)`` for ``@string``,
    * ``(value_parts,)`` for ``@preamble``,
    * ``(entry_key, [(field_name, value_parts), ...])`` for any other command.

    ``value_parts`` is a list of strings, one per ``#``-concatenated part of
    the value, with macros already substituted.  ``@comment`` commands and
    entries rejected by ``want_entry`` are skipped.
    """

    # Token patterns.  A name starts with a NAME_CHARS character and may
    # contain digits after the first character.
    NAME = Pattern(r'[{0}][{1}]*'.format(re.escape(NAME_CHARS), re.escape(NAME_CHARS + digits)), 'a valid name')
    # Entry keys: in a ``@type(...)`` body a key may contain ``}``,
    # in a ``@type{...}`` body it may not.
    KEY_PAREN = Pattern(r'[^\s\,]+', 'entry key')
    KEY_BRACE = Pattern(r'[^\s\,}]+', 'entry key')
    NUMBER = Pattern(r'[{0}]+'.format(digits), 'a number')
    LBRACE = Literal(u'{')
    RBRACE = Literal(u'}')
    LPAREN = Literal(u'(')
    RPAREN = Literal(u')')
    QUOTE = Literal(u'"')
    COMMA = Literal(u',')
    EQUALS = Literal(u'=')
    HASH = Literal(u'#')
    AT = Literal(u'@')

    # Per-command parsing state; reset in parse_command().
    command_start = None
    current_command = None
    current_entry_key = None
    current_fields = None
    current_field_name = None
    # Legacy attribute name kept for backward compatibility; the methods
    # below store the value being parsed in ``current_value``.
    current_field_value = None
    current_value = None

    def __init__(
        self, text,
        keyless_entries=False,
        macros=month_names,
        handle_error=None,
        want_entry=None,
        filename=None
    ):
        """Prepare to parse ``text``.

        :param text: the BibTeX source to scan.
        :param keyless_entries: if true, entries are parsed without a
            leading key and their key is reported as ``None``.
        :param macros: mapping of macro names to their expansions.
            ``@string`` commands add their definitions to this mapping.
            A mapping supplied by the caller is used (and updated) in
            place; the default month table is copied first.
        :param handle_error: optional callable replacing ``handle_error``.
        :param want_entry: optional callable replacing ``want_entry``.
        :param filename: passed on to the ``Scanner`` constructor.
        """
        super(LowLevelParser, self).__init__(text, filename)
        self.keyless_entries = keyless_entries
        # parse_string_body() writes into self.macros.  When the caller did
        # not supply a mapping, work on a private copy so that @string
        # definitions never leak into the shared module-level month table.
        self.macros = dict(macros) if macros is month_names else macros
        if handle_error:
            self.handle_error = handle_error
        if want_entry:
            self.want_entry = want_entry

    def __iter__(self):
        """Iterate over the commands found in the text."""
        return self.parse_bibliography()

    def get_error_context_info(self):
        """Return ``(command_start, lineno, pos)`` for the current position."""
        return self.command_start, self.lineno, self.pos

    def get_error_context(self, context_info):
        """Return ``(context, lineno, colno)`` for a saved error position.

        ``context`` is the source text from the start of the current command
        up to the end of the line containing the error, with trailing line
        breaks removed; ``colno`` is the length of the last line of the text
        preceding the error.
        """
        error_start, lineno, error_pos = context_info
        before_error = self.text[error_start:error_pos]
        if not before_error.endswith('\n'):
            # The error is in the middle of a line: extend the context to
            # the end of that line (or to the end of the text).
            eol = self.NEWLINE.search(self.text, error_pos)
            error_end = eol.end() if eol else self.end_pos
        else:
            error_end = error_pos
        context = self.text[error_start:error_end].rstrip('\r\n')
        colno = len(before_error.splitlines()[-1])
        return context, lineno, colno

    def handle_error(self, error):
        """Default error handler: re-raise the error."""
        raise error

    def want_entry(self, key):
        """Default entry filter: accept every entry."""
        return True

    def want_current_entry(self):
        """Tell whether the entry being parsed should be kept.

        Entries whose key is not known (yet) are always wanted.
        """
        return self.current_entry_key is None or self.want_entry(self.current_entry_key)

    def parse_bibliography(self):
        """Generate the parsed commands, one tuple per ``@`` command.

        Syntax errors are passed to ``handle_error``; if the handler returns,
        scanning resumes at the next ``@``.
        """
        while True:
            if not self.skip_to([self.AT]):
                # No more commands in the text.
                return
            # Remember where the command began (the position of the "@")
            # for error reporting.
            self.command_start = self.pos - 1
            try:
                yield tuple(self.parse_command())
            except PybtexSyntaxError as error:
                self.handle_error(error)
            except SkipEntry:
                pass

    def parse_command(self):
        """Parse one command; the leading ``@`` has already been consumed.

        Returns ``(command_name, payload)`` as described in the class
        docstring.  Raises ``SkipEntry`` for ``@comment`` commands and for
        unwanted entries.  A syntax error in the command body is passed to
        ``handle_error``; if the handler returns, whatever was parsed before
        the error is still returned.
        """
        self.current_entry_key = None
        self.current_fields = []
        self.current_field_name = None
        self.current_value = []

        name = self.required([self.NAME])
        command = name.value
        body_start = self.required([self.LPAREN, self.LBRACE])
        # The body must be closed by the delimiter matching the opening one.
        body_end = self.RBRACE if body_start.pattern == self.LBRACE else self.RPAREN

        command_lower = command.lower()
        if command_lower == 'string':
            parse_body = self.parse_string_body
            make_result = lambda: (command, (self.current_field_name, self.current_value))
        elif command_lower == 'preamble':
            parse_body = self.parse_preamble_body
            make_result = lambda: (command, (self.current_value,))
        elif command_lower == 'comment':
            raise SkipEntry
        else:
            parse_body = self.parse_entry_body
            make_result = lambda: (command, (self.current_entry_key, self.current_fields))
        try:
            parse_body(body_end)
            self.required([body_end])
        except PybtexSyntaxError as error:
            self.handle_error(error)
        return make_result()

    def parse_preamble_body(self, body_end):
        """Parse the body of a ``@preamble`` command: a single value."""
        self.parse_value()

    def parse_string_body(self, body_end):
        """Parse ``name = value`` and record it in ``self.macros``."""
        self.current_field_name = self.required([self.NAME]).value
        self.required([self.EQUALS])
        self.parse_value()
        self.macros[self.current_field_name] = ''.join(self.current_value)

    def parse_entry_body(self, body_end):
        """Parse the key (unless ``keyless_entries``) and fields of an entry.

        Raises ``SkipEntry`` after parsing if the entry is not wanted.
        """
        if not self.keyless_entries:
            key_pattern = self.KEY_PAREN if body_end == self.RPAREN else self.KEY_BRACE
            self.current_entry_key = self.required([key_pattern]).value
        self.parse_entry_fields()
        if not self.want_current_entry():
            raise SkipEntry

    def parse_entry_fields(self):
        """Parse comma-separated fields into ``self.current_fields``.

        Fields without a name or with an empty value list are not recorded.
        Parsing stops at the first position not followed by a comma.
        """
        while True:
            self.current_field_name = None
            self.current_value = []
            self.parse_field()
            if self.current_field_name and self.current_value:
                self.current_fields.append((self.current_field_name, self.current_value))
            comma = self.optional([self.COMMA])
            if not comma:
                return

    def parse_field(self):
        """Parse one ``name = value`` field, if a name is present."""
        name = self.optional([self.NAME])
        if not name:
            # Nothing to parse here (e.g. after a trailing comma).
            return
        self.current_field_name = name.value
        self.required([self.EQUALS])
        self.parse_value()

    def parse_value(self):
        """Parse a value made of one or more parts joined by ``#``.

        The list of parsed parts is stored in ``self.current_value``.
        """
        start = True
        concatenation = False
        value_parts = []
        while True:
            if not start:
                # After the first part, continue only if a "#" follows.
                concatenation = self.optional([self.HASH])
            if not (start or concatenation):
                break
            value_parts.append(self.parse_value_part())
            start = False
        self.current_value = value_parts

    def parse_value_part(self):
        """Parse one value part and return it as a string.

        A part is a quoted string, a braced string, a number, or a macro
        name (which is replaced by its expansion).
        """
        token = self.required(
            [self.QUOTE, self.LBRACE, self.NUMBER, self.NAME],
            description='field value',
        )
        if token.pattern is self.QUOTE:
            value_part = self.flatten_string(self.parse_string(string_end=self.QUOTE))
        elif token.pattern is self.LBRACE:
            value_part = self.flatten_string(self.parse_string(string_end=self.RBRACE))
        elif token.pattern is self.NUMBER:
            value_part = token.value
        else:
            value_part = self.substitute_macro(token.value)
        return value_part

    def flatten_string(self, parts):
        """Join the tokens from ``parse_string`` and drop the last character.

        The last token yielded by ``parse_string`` is the one matching the
        string terminator, so the dropped character is presumably the
        closing quote or brace.
        """
        return ''.join(part.value for part in parts)[:-1]

    def substitute_macro(self, name):
        """Return the expansion of macro ``name``.

        For an unknown macro an ``UndefinedMacro`` error is passed to
        ``handle_error`` (only if the current entry is wanted) and an empty
        string is returned.
        """
        try:
            return self.macros[name]
        except KeyError:
            if self.want_current_entry():
                self.handle_error(UndefinedMacro(name, self))
            return ''

    def parse_string(self, string_end, level=0, max_level=100):
        """Generate the tokens of a string, up to and including its terminator.

        :param string_end: the pattern that terminates the string
            (``QUOTE`` or ``RBRACE``).
        :param level: current brace nesting depth.
        :param max_level: maximum allowed nesting depth; applies to nested
            brace groups as well.
        :raises PybtexSyntaxError: on too deep nesting or unbalanced braces.
        :raises PrematureEOF: if the text ends inside the string.
        """
        if level > max_level:
            raise PybtexSyntaxError('too many nested braces', self)

        special_chars = [self.RBRACE, self.LBRACE]
        if string_end is self.QUOTE:
            # Quotes are only looked for when they terminate the string.
            special_chars = [self.QUOTE] + special_chars
        while True:
            part = self.skip_to(special_chars)
            if not part:
                raise PrematureEOF(self)
            if part.pattern is string_end:
                yield part
                break
            elif part.pattern is self.LBRACE:
                yield part
                # Pass max_level down so a caller-supplied limit is honoured
                # at every nesting depth, not just at the top level.
                for subpart in self.parse_string(self.RBRACE, level + 1, max_level):
                    yield subpart
            elif part.pattern is self.RBRACE and level == 0:
                raise PybtexSyntaxError('unbalanced braces', self)
|
|
|
|
|
|
class BibTeXEntryIterator(LowLevelParser):
    """Deprecated name of ``LowLevelParser``, kept for backward compatibility."""

    def __init__(self, *args, **kwargs):
        """Emit a ``DeprecationWarning`` and delegate to ``LowLevelParser``."""
        import warnings

        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn(
            'BibTeXEntryIterator is deprecated since 0.22: renamed to LowLevelParser',
            DeprecationWarning,
            stacklevel=2
        )
        super(BibTeXEntryIterator, self).__init__(*args, **kwargs)
|
|
|
|
|
|
class Parser(BaseParser):
    """BibTeX parser that fills ``self.data`` with entries and preamble text.

    Uses ``LowLevelParser`` to split the text into commands and converts
    each entry into an ``Entry`` with separate person and plain fields.
    """

    default_suffix = '.bib'
    unicode_io = True

    macros = None

    def __init__(
        self,
        encoding=None,
        macros=month_names,
        person_fields=Person.valid_roles,
        keyless_entries=False,
        **kwargs
    ):
        """Set up the parser.

        :param encoding: passed on to ``BaseParser``.
        :param macros: initial macro definitions; copied into a
            case-insensitive dictionary.
        :param person_fields: names of the fields holding person lists;
            matched case-insensitively.
        :param keyless_entries: if true, entries are parsed without keys and
            get generated ``unnamed-N`` keys.
        :param kwargs: passed on to ``BaseParser``.
        """
        BaseParser.__init__(self, encoding, **kwargs)

        self.keyless_entries = keyless_entries
        self.person_fields = CaseInsensitiveSet(person_fields)
        self.macros = CaseInsensitiveDict(macros)

    def process_entry(self, entry_type, key, fields):
        """Build an ``Entry`` from parsed fields and add it to ``self.data``.

        A ``None`` key is replaced by a generated ``unnamed-N`` key.  A field
        repeated within the entry (compared case-insensitively) is reported
        through ``handle_error`` and its later occurrence is ignored.
        """
        entry = Entry(entry_type)

        if key is None:
            key = 'unnamed-%i' % self.unnamed_entry_counter
            self.unnamed_entry_counter += 1

        already_seen = set()
        for field_name, value_parts in fields:
            lowered_name = field_name.lower()
            if lowered_name in already_seen:
                self.handle_error(DuplicateField(key, field_name))
                continue

            raw_text = self.flatten_value_list(value_parts)
            field_text = textutils.normalize_whitespace(raw_text)
            if field_name in self.person_fields:
                # Person fields hold a list of names; add each one separately.
                for person_name in split_name_list(field_text):
                    entry.add_person(Person(person_name), field_name)
            else:
                entry.fields[field_name] = field_text
            already_seen.add(lowered_name)
        self.data.add_entry(key, entry)

    def process_preamble(self, value_list):
        """Add a whitespace-normalized ``@preamble`` value to ``self.data``."""
        preamble_text = self.flatten_value_list(value_list)
        self.data.add_to_preamble(textutils.normalize_whitespace(preamble_text))

    def flatten_value_list(self, value_list):
        """Concatenate the parts of a value into a single string."""
        return ''.join(value_list)

    def handle_error(self, error):
        """Report the error through ``pybtex.errors.report_error``."""
        from pybtex.errors import report_error

        report_error(error)

    def parse_string(self, text):
        """Parse BibTeX source ``text`` and return ``self.data``."""
        self.unnamed_entry_counter = 1
        self.command_start = 0

        commands = LowLevelParser(
            text,
            keyless_entries=self.keyless_entries,
            handle_error=self.handle_error,
            want_entry=self.data.want_entry,
            filename=self.filename,
            macros=self.macros,
        )
        for entry_type, payload in commands:
            kind = entry_type.lower()
            if kind == 'string':
                # Nothing to do here: the low-level parser has already
                # recorded the macro in self.macros.
                continue
            if kind == 'preamble':
                self.process_preamble(*payload)
            else:
                self.process_entry(entry_type, *payload)
        return self.data

    def parse_stream(self, stream):
        """Read all of ``stream`` and parse it with ``parse_string``."""
        return self.parse_string(stream.read())
|
|
|