Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions mwxml/element_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,12 @@ def __init__(self, element, pointer):
def __iter__(self):

while not self.done and self.pointer.depth() > self.depth:
event, element = next(self.pointer)
try:
event, element = next(self.pointer)
except StopIteration:
# Stream exhausted - this is normal completion
# PEP 479: Catch StopIteration to prevent it from escaping the generator
break

if event == "start":
sub_iterator = ElementIterator(element, self.pointer)
Expand All @@ -69,7 +74,12 @@ def __iter__(self):
def complete(self):

while not self.done and self.pointer.depth() > self.depth:
event, element = next(self.pointer)
try:
event, element = next(self.pointer)
except StopIteration:
# Stream exhausted - this is normal completion
# PEP 479: Catch StopIteration to prevent it from escaping the generator
break
if self.pointer.depth() > self.depth:
element.clear()

Expand Down
202 changes: 202 additions & 0 deletions mwxml/iteration/tests/test_stopiteration_bug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
"""
Test suite for StopIteration bug fix (PEP 479 compatibility).

This module tests that the mwxml library properly handles StopIteration
exceptions in Python 3.7+ where PEP 479 converts StopIteration raised
inside generators to RuntimeError.
"""

import io
import pytest
import sys

from ..dump import Dump


# Sample XML with valid MediaWiki structure: a <siteinfo> header followed by a
# single <page> (id 1, namespace 0) that holds one <revision>.
# The literal opens with a newline, so the document has no XML declaration.
MINIMAL_XML = """
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
<siteinfo>
<sitename>Wikipedia</sitename>
<dbname>enwiki</dbname>
</siteinfo>
<page>
<title>Test Page</title>
<ns>0</ns>
<id>1</id>
<revision>
<id>100</id>
<timestamp>2021-01-01T00:00:00Z</timestamp>
<text>Test content</text>
</revision>
</page>
</mediawiki>
"""

# Two-page sample dump: the same <siteinfo> header, then "Page 1" (id 1) and
# "Page 2" (id 2), each with a single <revision>. Used by the tests that need
# iteration to continue past the first page.
MULTI_PAGE_XML = """
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
<siteinfo>
<sitename>Wikipedia</sitename>
<dbname>enwiki</dbname>
</siteinfo>
<page>
<title>Page 1</title>
<ns>0</ns>
<id>1</id>
<revision>
<id>100</id>
<timestamp>2021-01-01T00:00:00Z</timestamp>
<text>Content 1</text>
</revision>
</page>
<page>
<title>Page 2</title>
<ns>0</ns>
<id>2</id>
<revision>
<id>200</id>
<timestamp>2021-01-02T00:00:00Z</timestamp>
<text>Content 2</text>
</revision>
</page>
</mediawiki>
"""


@pytest.mark.skipif(
    sys.version_info < (3, 7),
    reason="Bug only affects Python 3.7+",
)
def test_stopiteration_bug_reproduction():
    """
    Regression check for the PEP 479 ``RuntimeError`` on Python 3.7+.

    Exhausting the XML stream used to let ``StopIteration`` leak through a
    generator, which PEP 479 turns into
    ``RuntimeError: generator raised StopIteration``.

    The test FAILS (with an explicit message) while that bug is present and
    PASSES once the stream exhaustion is handled.
    """
    source = io.StringIO(MINIMAL_XML)
    dump = Dump.from_file(source)

    try:
        # With the fix in place the dump is consumed to the end without error.
        collected = list(dump)
        assert len(collected) == 1
        assert collected[0].title == "Test Page"
    except RuntimeError as error:
        # Any RuntimeError other than the PEP 479 one is unrelated to this
        # regression; let it surface unchanged.
        if "generator raised StopIteration" not in str(error):
            raise
        pytest.fail(
            "Bug reproduced: RuntimeError raised due to StopIteration in generator. "
            "The fix has not been applied yet."
        )


def test_iteration_completes_normally():
    """
    Check that a single-page dump can be consumed to the end.

    Draining the dump must not raise ``RuntimeError`` when the stream runs
    out, and the one page it contains must be parsed with the expected
    title, id and namespace.
    """
    # list() drives the iterator all the way to stream exhaustion.
    pages = list(Dump.from_file(io.StringIO(MINIMAL_XML)))

    assert len(pages) == 1

    only_page = pages[0]
    assert only_page.title == "Test Page"
    assert only_page.id == 1
    assert only_page.namespace == 0


def test_multiple_pages_iteration():
    """
    Check that every page of a multi-page dump is yielded, in order.

    Confirms that handling stream exhaustion does not disturb ordinary
    iteration across more than one page.
    """
    # Consuming the whole dump must not raise RuntimeError.
    pages = list(Dump.from_file(io.StringIO(MULTI_PAGE_XML)))

    assert len(pages) == 2

    first, second = pages
    assert first.title == "Page 1"
    assert first.id == 1
    assert second.title == "Page 2"
    assert second.id == 2


def test_iteration_with_generator_pattern():
    """
    Check that the dump can be consumed lazily, one page at a time.

    Rather than materialising the pages with ``list(dump)``, the dump is
    iterated directly and only each page's title is kept.
    """
    dump = Dump.from_file(io.StringIO(MULTI_PAGE_XML))

    # Pull pages one by one straight from the dump's iterator.
    titles = [page.title for page in dump]

    assert titles == ["Page 1", "Page 2"]


def test_empty_dump_iteration():
    """
    Check that a dump containing no pages iterates to an empty result.

    Edge case: the document has a ``<siteinfo>`` header but no ``<page>``
    elements at all.
    """
    empty_xml = """
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
<siteinfo>
<sitename>Wikipedia</sitename>
<dbname>enwiki</dbname>
</siteinfo>
</mediawiki>
"""

    # Draining the dump must finish cleanly even though nothing is yielded.
    pages = list(Dump.from_file(io.StringIO(empty_xml)))

    assert len(pages) == 0


def test_partial_iteration():
    """
    Check that abandoning iteration after the first page is harmless.

    Only the first page is pulled from a two-page dump; stopping early must
    not trip over the StopIteration handling.
    """
    dump = Dump.from_file(io.StringIO(MULTI_PAGE_XML))

    # Take exactly one page and leave the rest of the stream unread.
    first_page = next(iter(dump), None)

    assert first_page is not None
    assert first_page.title == "Page 1"


if __name__ == "__main__":
    # Allow running tests directly. pytest.main() returns the run's exit code
    # instead of exiting itself, so pass it to sys.exit(); otherwise the
    # process would report success (status 0) even when tests fail.
    sys.exit(pytest.main([__file__, "-v"]))