Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix pullquote processing #727

Merged
merged 1 commit into from
Jun 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 25 additions & 14 deletions content_migration/management/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from dataclasses import dataclass
from io import BytesIO
from itertools import chain
import re
from urllib.parse import urlparse

import requests
Expand Down Expand Up @@ -175,21 +174,23 @@ def create_block(generic_block: GenericBlock) -> tuple[str, str | dict]:


def remove_pullquote_tags(item_string: str) -> str:
    """Remove spans with class 'pullquote' from a string, keeping their contents.

    Every ``<span class="pullquote">`` element is unwrapped: the span tag
    itself is dropped and its children stay in place. There may be several
    pullquote spans in one string; all of them are unwrapped.

    Spans without the ``pullquote`` class are left untouched.

    Args:
        item_string: HTML fragment that may contain pullquote spans.

    Returns:
        The HTML serialized by BeautifulSoup after unwrapping the pullquote
        spans. An empty input is returned as-is without being parsed.
    """
    # Nothing to parse; mirrors the empty-string guard of the earlier
    # string-replacement implementation.
    if not item_string:
        return item_string

    soup = BeautifulSoup(item_string, "html.parser")

    # Match on the class so that unrelated spans are never unwrapped.
    for pullquote_span in soup.find_all("span", class_="pullquote"):
        pullquote_span.unwrap()

    return str(soup)


def adapt_html_to_generic_blocks(html_string: str) -> list[GenericBlock]:
Expand Down Expand Up @@ -349,9 +350,19 @@ def create_image_block_from_file_bytes(


def extract_pullquotes(item: str) -> list[str]:
    """Get the text of every pullquote found within the item.

    Pullquotes are wrapped in a span with class 'pullquote'. Only the text
    inside each span is returned, not the span markup itself.

    Args:
        item: HTML fragment that may contain pullquote spans.

    Returns:
        The pullquote texts in document order. Spans with no text at all are
        skipped, so the list never contains ``None`` or empty strings.
    """
    soup = BeautifulSoup(item, "html.parser")

    pullquotes: list[str] = []
    for pullquote_span in soup.find_all("span", class_="pullquote"):
        # ``Tag.string`` is None when the span holds nested markup (for
        # example an <em> inside the pullquote) or nothing at all;
        # ``get_text()`` always yields a plain ``str``.
        text = pullquote_span.get_text()
        if text:
            pullquotes.append(text)

    return pullquotes


def fetch_file_bytes(url: str) -> FileBytesWithMimeType:
Expand Down
81 changes: 68 additions & 13 deletions content_migration/management/test_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,13 @@ def test_extract_image_urls(self) -> None:
self.assertEqual(output_images, expected_images)


class ParseBodyBlocksTestCase(TestCase):
def test_parse_body_blocks(self) -> None:
class ParseBodyBlocksSimpleTestCase(SimpleTestCase):
def test_parse_body_blocks_with_pullquote(self) -> None:
self.MaxDiff = None
input_html = """<p>Some text[pullquote]with a pullquote[/pullquote]</p>"""
expected_output_html = """<p>Some textwith a pullquote</p>"""
input_html = (
"""<p>Some text <span class="pullquote">with a pullquote</span></p>"""
)
expected_output_html = """<p>Some text with a pullquote</p>"""

output_blocks = parse_body_blocks(input_html)
expected_blocks = [
Expand Down Expand Up @@ -116,8 +118,57 @@ def test_parse_body_blocks(self) -> None:

def test_parse_body_blocks_with_multiple_pullquotes(self) -> None:
    """Two pullquote spans in one paragraph produce two pullquote blocks
    followed by a rich_text block whose HTML has the spans unwrapped."""
    # unittest reads ``maxDiff`` (lower-case "m"); assigning ``MaxDiff``
    # has no effect on diff truncation.
    self.maxDiff = None
    input_html = """<p>Some text <span class="pullquote">with a pullquote</span> and <span class="pullquote">another pullquote</span>.</p>"""  # noqa: E501
    output_blocks = parse_body_blocks(input_html)

    expected_blocks = [
        (
            "pullquote",
            "with a pullquote",
        ),
        (
            "pullquote",
            "another pullquote",
        ),
        (
            "rich_text",
            RichText(
                """<p>Some text with a pullquote and another pullquote.</p>"""
            ),
        ),
    ]

    # Make sure the first output block matches the first expected pullquote
    self.assertEqual(output_blocks[0][0], expected_blocks[0][0])
    self.assertEqual(output_blocks[0][1], expected_blocks[0][1])

    # Make sure the second output block matches the second expected pullquote
    self.assertEqual(output_blocks[1][0], expected_blocks[1][0])
    self.assertEqual(output_blocks[1][1], expected_blocks[1][1])

    # Make sure the third output block is a rich_text block; its content is
    # compared via the .source attribute on the RichText objects
    self.assertEqual(output_blocks[2][1].source, expected_blocks[2][1].source)
    self.assertEqual(output_blocks[2][0], expected_blocks[2][0])

def test_parse_body_blocks_with_multiple_consecutive_paragraphs(self) -> None:
self.MaxDiff = None
Expand All @@ -144,7 +195,7 @@ def test_parse_body_blocks_with_multiple_consecutive_paragraphs(self) -> None:

def test_parse_body_blocks_with_multiple_paragraphs_and_a_pullquote(self) -> None:
self.MaxDiff = None
input_html = """<p>One paragraph.</p><p>Another paragraph.</p><p>[pullquote]A pullquote[/pullquote] inside of a paragraph</p>""" # noqa: E501
input_html = """<p>One paragraph.</p><p>Another paragraph.</p><p><span class="pullquote">A pullquote</span> inside of a paragraph</p>""" # noqa: E501

output_blocks = parse_body_blocks(input_html)
expected_blocks = [
Expand Down Expand Up @@ -552,15 +603,17 @@ def test_create_archive_issues_from_articles_dicts(self) -> None:

class RemovePullquoteTagsSimpleTestCase(SimpleTestCase):
def test_remove_pullquote_tags(self) -> None:
input_html = """<p>Some text[pullquote]with a pullquote[/pullquote]</p>"""
input_html = (
"""<p>Some text <span class="pullquote">with a pullquote</span></p>"""
)
output_html = remove_pullquote_tags(input_html)

expected_html = """<p>Some textwith a pullquote</p>"""
expected_html = """<p>Some text with a pullquote</p>"""

self.assertEqual(output_html, expected_html)

def test_remove_pullquote_tags_with_multiple_pullquotes(self) -> None:
input_html = """<p>Some text [pullquote]with a pullquote[/pullquote] and another [pullquote]with a pullquote[/pullquote]</p>""" # noqa: E501
input_html = """<p>Some text <span class="pullquote">with a pullquote</span> and another <span class="pullquote">with a pullquote</span></p>""" # noqa: E501
output_html = remove_pullquote_tags(input_html)
expected_html = """<p>Some text with a pullquote and another with a pullquote</p>""" # noqa: E501

Expand All @@ -569,13 +622,15 @@ def test_remove_pullquote_tags_with_multiple_pullquotes(self) -> None:

class ExtractPullquotesSimpleTestCase(SimpleTestCase):
def test_extract_pullquotes(self) -> None:
input_html = """<p>Some text[pullquote]with a pullquote[/pullquote]</p>"""
input_html = (
"""<p>Some text<span class="pullquote">with a pullquote</span></p>"""
)
output_pullquotes = extract_pullquotes(input_html)
expected_pullquotes = ["with a pullquote"]
self.assertEqual(output_pullquotes, expected_pullquotes)

def test_extract_pullquotes_with_multiple_pullquotes(self) -> None:
input_html = """<p>Some text[pullquote]with a pullquote[/pullquote] and another [pullquote]with a pullquote[/pullquote]</p>""" # noqa: E501
input_html = """<p>Some text<span class="pullquote">with a pullquote</span> and another <span class="pullquote">with a pullquote</span></p>""" # noqa: E501
output_pullquotes = extract_pullquotes(input_html)
expected_pullquotes = ["with a pullquote", "with a pullquote"]
self.assertEqual(output_pullquotes, expected_pullquotes)
Expand All @@ -598,7 +653,7 @@ def test_adapt_html_to_generic_blocks(self) -> None:
self.assertEqual(generic_blocks[0].block_content, html_string)

def test_adapt_html_to_generic_blocks_with_pullquote(self) -> None:
html_string = """<p>Some text</p><p>Some more text</p><p>A paragraph with [pullquote]a pullquote[/pullquote] that should be extracted</p>""" # noqa: E501
html_string = """<p>Some text</p><p>Some more text</p><p>A paragraph with <span class="pullquote">a pullquote</span> that should be extracted</p>""" # noqa: E501

generic_blocks = adapt_html_to_generic_blocks(html_string)

Expand Down