Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions bibtexparser/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,24 @@ def _reset_block_status(self, current_char_index: int) -> None:
self._implicit_comment_start_line = self._current_line
self._implicit_comment_start: Optional[int] = current_char_index

def _is_at_line_start(self, pos: int) -> bool:
"""Check if position is at the start of a line (after optional whitespace).

This is used to determine whether an @ sign should be treated as a new
block start (for error recovery) or as content within a field value.
We only want to abort parsing and start a new block if the @ is at the
beginning of a line, to avoid false positives with @ signs in content.
"""
# Scan backwards from pos to find either newline or non-whitespace
for i in range(pos - 1, -1, -1):
char = self.bibstr[i]
if char == "\n":
return True
elif not char.isspace():
return False
# Start of string counts as line start
return True

def _end_implicit_comment(self, end_char_index) -> Optional[ImplicitComment]:
if self._implicit_comment_start is None:
return # No implicit comment started
Expand Down Expand Up @@ -122,7 +140,11 @@ def _move_to_closed_bracket(self) -> int:
return m.start()
else:
num_additional_brackets -= 1
elif m.group(0).startswith("@"):
elif m.group(0).startswith("@") and self._is_at_line_start(m.start()):
# Only abort if the @ is at the start of a line.
# This allows @ signs in field values (e.g., "LeQua @ {CLEF}")
# while still providing error recovery when a new block starts
# on a new line within an unclosed block.
self._unaccepted_mark = m
raise BlockAbortedException(
abort_reason=f"Unexpected block start: `{m.group(0)}`. "
Expand Down Expand Up @@ -169,8 +191,11 @@ def _is_escaped():
self._unaccepted_mark = next_mark
return next_mark.start()

# Sanity-check: If new block is starting, we abort
elif next_mark.group(0).startswith("@"):
# Sanity-check: If new block is starting at line start, we abort.
# We only abort if the @ is at the start of a line to allow @ signs
# in field values (e.g., "LeQua @ {CLEF}") while still providing
# error recovery when a new block starts on a new line.
elif next_mark.group(0).startswith("@") and self._is_at_line_start(next_mark.start()):
self._unaccepted_mark = next_mark

if currently_quote_escaped:
Expand Down
293 changes: 293 additions & 0 deletions tests/splitter_tests/test_splitter_block_start_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
"""Tests for block start detection behavior.

These tests verify the fix for issue #488 and the tradeoffs discussed in PR #416:
- @ signs in field values should not be treated as new block starts
- Multiple blocks on the same line should be parsed correctly
- Error recovery should still work when a new block starts at line start
"""

from textwrap import dedent

import pytest

from bibtexparser.splitter import Splitter

# =============================================================================
# Test: @ signs in field values (issue #488)
# =============================================================================


@pytest.mark.parametrize(
"bibtex_str,expected_key,expected_field,expected_substring",
[
pytest.param(
dedent(
"""\
@inproceedings{DBLP:conf/cikm/EsuliM021,
author = {Andrea Esuli},
title = {LeQua @ {CLEF} 2022: {A} Shared Task},
year = {2021}
}"""
),
"DBLP:conf/cikm/EsuliM021",
"title",
"@ {CLEF}",
id="at_sign_space_brace_in_title",
),
pytest.param(
"@article{test, email = {[email protected]}}",
"test",
"email",
"[email protected]",
id="email_address_in_braces",
),
pytest.param(
'@article{test, email = "[email protected]"}',
"test",
"email",
"[email protected]",
id="email_address_in_quotes",
),
pytest.param(
"@article{test, note = {Contact [email protected] or [email protected]}}",
"test",
"note",
"[email protected]",
id="multiple_at_signs",
),
pytest.param(
"@article{test, title = {Workshop @ {ICML} 2023}}",
"test",
"title",
"@ {ICML}",
id="at_sign_followed_by_brace",
),
pytest.param(
'@article{test, title = "BibTeX entries start with @article{"}',
"test",
"title",
"@article{",
id="literal_at_entry_in_quotes",
),
pytest.param(
# Note: 3 closing braces - inner {}, title field, entry
"@article{test, title = {BibTeX entries start with @article{}}}",
"test",
"title",
"@article{",
id="literal_at_entry_in_braces",
),
],
)
def test_at_sign_in_field_value(
bibtex_str: str, expected_key: str, expected_field: str, expected_substring: str
):
"""@ signs in field values should be parsed as content, not block starts."""
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 0
assert len(library.entries) == 1
assert library.entries[0].key == expected_key
assert expected_substring in library.entries[0][expected_field]


# =============================================================================
# Test: Multiple blocks on the same line
# =============================================================================


@pytest.mark.parametrize(
"bibtex_str,expected_entry_keys",
[
pytest.param(
"@article{key1, title={A}} @book{key2, title={B}}",
["key1", "key2"],
id="two_entries_with_space",
),
pytest.param(
"@article{key1,title={A}}@book{key2,title={B}}",
["key1", "key2"],
id="two_entries_no_space",
),
pytest.param(
"@article{a, x={1}} @book{b, y={2}} @misc{c, z={3}}",
["a", "b", "c"],
id="three_entries",
),
],
)
def test_multiple_entries_same_line(bibtex_str: str, expected_entry_keys: list):
"""Multiple well-formed entries on the same line should all be parsed."""
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 0
assert len(library.entries) == len(expected_entry_keys)
assert [e.key for e in library.entries] == expected_entry_keys


@pytest.mark.parametrize(
"bibtex_str,expected_entries,expected_strings,expected_comments",
[
pytest.param(
'@article{key1, title={A}} @string{mystr = "value"}',
1,
1,
0,
id="entry_and_string",
),
pytest.param(
"@article{key1, title={A}} @comment{A comment}",
1,
0,
1,
id="entry_and_comment",
),
],
)
def test_mixed_blocks_same_line(
bibtex_str: str, expected_entries: int, expected_strings: int, expected_comments: int
):
"""Different block types on the same line should all be parsed."""
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 0
assert len(library.entries) == expected_entries
assert len(library.strings) == expected_strings
assert len(library.comments) == expected_comments


# =============================================================================
# Test: Error recovery when new block starts at line start
# =============================================================================


@pytest.mark.parametrize(
"bibtex_str,expected_valid_key",
[
pytest.param(
dedent(
"""\
@article{broken, title={Unclosed
@article{valid, title={Valid Entry}}"""
),
"valid",
id="unclosed_entry_field",
),
pytest.param(
dedent(
"""\
@string{broken = {unclosed value
@article{valid, title={Valid Entry}}"""
),
"valid",
id="unclosed_string",
),
pytest.param(
dedent(
"""\
@article{broken, title={Unclosed
@article{valid, title={Valid Entry}}"""
),
"valid",
id="indented_new_block",
),
],
)
def test_error_recovery_at_line_start(bibtex_str: str, expected_valid_key: str):
"""New block at line start should trigger recovery from malformed block."""
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 1
assert len(library.entries) == 1
assert library.entries[0].key == expected_valid_key


def test_error_recovery_preserves_failed_block_raw():
"""The failed block should contain raw text up to where recovery started."""
bibtex_str = dedent(
"""\
@article{broken, title={This is unclosed
@article{valid, title={OK}}"""
)
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 1
failed = library.failed_blocks[0]
assert "broken" in failed.raw
assert "This is unclosed" in failed.raw


# =============================================================================
# Test: No false recovery for @ mid-line
# =============================================================================


@pytest.mark.parametrize(
"bibtex_str",
[
pytest.param(
"@article{test, title={unclosed @misc{fake}",
id="at_entry_mid_line",
),
pytest.param(
"@article{test, title={text @ {more} unclosed",
id="at_brace_mid_line",
),
],
)
def test_no_false_recovery_mid_line(bibtex_str: str):
"""@ mid-line should not trigger false error recovery."""
library = Splitter(bibtex_str).split()

# Should fail as one block, no recovery
assert len(library.failed_blocks) == 1
assert len(library.entries) == 0


# =============================================================================
# Test: Edge cases
# =============================================================================


@pytest.mark.parametrize(
"bibtex_str",
[
pytest.param(
"@article{test, title={Hello}}",
id="block_at_file_start",
),
pytest.param(
" \t @article{test, title={Hello}}",
id="block_after_whitespace_only",
),
pytest.param(
"@article{test, title={L1 {L2 {[email protected]} back} done}}",
id="nested_braces_with_at",
),
],
)
def test_edge_cases_entries(bibtex_str: str):
"""Various edge cases should parse without failure."""
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 0
assert len(library.entries) == 1


def test_preamble_with_at_sign():
"""@ sign inside a preamble block."""
bibtex_str = '@preamble{"Contact: [email protected]"}'
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 0
assert len(library.preambles) == 1


def test_explicit_comment_with_at_sign():
"""@ sign inside an explicit comment block."""
bibtex_str = "@comment{Email: [email protected]}"
library = Splitter(bibtex_str).split()

assert len(library.failed_blocks) == 0
assert len(library.comments) == 1
assert "[email protected]" in library.comments[0].comment
Loading