WesternFriend · brylie · Jun 19, 2023 · Jun 19, 2023
diff --git a/content_migration/management/shared.py b/content_migration/management/shared.py
@@ -6,7 +6,6 @@
 from dataclasses import dataclass
 from io import BytesIO
 from itertools import chain
-import re
 from urllib.parse import urlparse
 
 import requests
@@ -175,21 +174,23 @@ def create_block(generic_block: GenericBlock) -> tuple[str, str | dict]:
 
 
 def remove_pullquote_tags(item_string: str) -> str:
-    """Remove "[pullquote]" and "[/pullquote]" tags in string.
+    """Strip pullquote span tags from an HTML string, keeping their contents.
 
-    https://stackoverflow.com/a/44593228/1191545
-    """
+    Every ``<span class="pullquote">`` element is unwrapped, however many
+    occur in the string. Spans without the ``pullquote`` class are not
+    unwrapped.
+
+    The result is the parsed document serialized back to a string.
+    """
 
-    replacement_values: list = [
-        ("[pullquote]", ""),
-        ("[/pullquote]", ""),
-    ]
+    document = BeautifulSoup(item_string, "html.parser")
+    pullquote_spans = document.find_all("span", {"class": "pullquote"})
 
-    if item_string != "":
-        for replacement_value in replacement_values:
-            item_string = item_string.replace(*replacement_value)
+    # unwrap() replaces each span with its own children, leaving the text behind
+    for span in pullquote_spans:
+        span.unwrap()
 
-    return item_string
+    return str(document)
 
 
 def adapt_html_to_generic_blocks(html_string: str) -> list[GenericBlock]:
@@ -349,9 +350,19 @@ def create_image_block_from_file_bytes(
 
 
 def extract_pullquotes(item: str) -> list[str]:
-    """Get a list of all pullquote strings found within the item."""
+    """Return the text of every pullquote found within the item.
+
+    Pullquotes are ``<span class="pullquote">`` elements. Only their text is
+    returned: the wrapping span and any nested inline markup are dropped.
+    """
+
+    soup = BeautifulSoup(item, "html.parser")
+
+    # Tag.string is None when a span holds more than one child node (e.g.
+    # nested <em> markup), so collect the text with get_text() instead.
+    pullquote_spans = soup.find_all("span", {"class": "pullquote"})
 
-    return re.findall(r"\[pullquote\](.+?)\[\/pullquote\]", item)  # type: ignore
+    return [pullquote.get_text() for pullquote in pullquote_spans]
 
 
 def fetch_file_bytes(url: str) -> FileBytesWithMimeType:

diff --git a/content_migration/management/test_shared.py b/content_migration/management/test_shared.py
@@ -78,11 +78,13 @@ def test_extract_image_urls(self) -> None:
         self.assertEqual(output_images, expected_images)
 
 
-class ParseBodyBlocksTestCase(TestCase):
-    def test_parse_body_blocks(self) -> None:
+class ParseBodyBlocksSimpleTestCase(SimpleTestCase):
+    def test_parse_body_blocks_with_pullquote(self) -> None:
         self.MaxDiff = None
-        input_html = """<p>Some text[pullquote]with a pullquote[/pullquote]</p>"""
-        expected_output_html = """<p>Some textwith a pullquote</p>"""
+        input_html = (
+            """<p>Some text <span class="pullquote">with a pullquote</span></p>"""
+        )
+        expected_output_html = """<p>Some text with a pullquote</p>"""
 
         output_blocks = parse_body_blocks(input_html)
         expected_blocks = [
@@ -116,8 +118,57 @@ def test_parse_body_blocks(self) -> None:
 
     def test_parse_body_blocks_with_multiple_pullquotes(self) -> None:
-        self.MaxDiff = None
-        input_html = """<p>Some text[pullquote]with a pullquote[/pullquote] and [pullquote]another pullquote[/pullquote].</p>"""  # noqa: E501
-        parse_body_blocks(input_html)
+        """Each pullquote span yields its own block ahead of the rich text."""
+        self.maxDiff = None
+        input_html = """<p>Some text <span class="pullquote">with a pullquote</span> and <span class="pullquote">another pullquote</span>.</p>"""  # noqa: E501
+        output_blocks = parse_body_blocks(input_html)
+
+        expected_blocks = [
+            (
+                "pullquote",
+                "with a pullquote",
+            ),
+            (
+                "pullquote",
+                "another pullquote",
+            ),
+            (
+                "rich_text",
+                RichText(
+                    """<p>Some text with a pullquote and another pullquote.</p>"""
+                ),
+            ),
+        ]
+
+        # Make sure first pullquote block matches the first expected pullquote block
+        self.assertEqual(
+            output_blocks[0][0],
+            expected_blocks[0][0],
+        )
+        self.assertEqual(
+            output_blocks[0][1],
+            expected_blocks[0][1],
+        )
+
+        # Make sure second output block matches second expected block
+        self.assertEqual(
+            output_blocks[1][0],
+            expected_blocks[1][0],
+        )
+        self.assertEqual(
+            output_blocks[1][1],
+            expected_blocks[1][1],
+        )
+
+        # Third block: compare RichText objects by their .source attribute,
+        # then check that the block type is rich_text
+        self.assertEqual(
+            output_blocks[2][1].source,
+            expected_blocks[2][1].source,
+        )
+        self.assertEqual(
+            output_blocks[2][0],
+            expected_blocks[2][0],
+        )
 
     def test_parse_body_blocks_with_multiple_consecutive_paragraphs(self) -> None:
         self.MaxDiff = None
@@ -144,7 +195,7 @@ def test_parse_body_blocks_with_multiple_consecutive_paragraphs(self) -> None:
 
     def test_parse_body_blocks_with_multiple_paragraphs_and_a_pullquote(self) -> None:
         self.MaxDiff = None
-        input_html = """<p>One paragraph.</p><p>Another paragraph.</p><p>[pullquote]A pullquote[/pullquote] inside of a paragraph</p>"""  # noqa: E501
+        input_html = """<p>One paragraph.</p><p>Another paragraph.</p><p><span class="pullquote">A pullquote</span> inside of a paragraph</p>"""  # noqa: E501
 
         output_blocks = parse_body_blocks(input_html)
         expected_blocks = [
@@ -552,15 +603,17 @@ def test_create_archive_issues_from_articles_dicts(self) -> None:
 
 class RemovePullquoteTagsSimpleTestCase(SimpleTestCase):
     def test_remove_pullquote_tags(self) -> None:
-        input_html = """<p>Some text[pullquote]with a pullquote[/pullquote]</p>"""
+        input_html = (
+            """<p>Some text <span class="pullquote">with a pullquote</span></p>"""
+        )
         output_html = remove_pullquote_tags(input_html)
 
-        expected_html = """<p>Some textwith a pullquote</p>"""
+        expected_html = """<p>Some text with a pullquote</p>"""
 
         self.assertEqual(output_html, expected_html)
 
     def test_remove_pullquote_tags_with_multiple_pullquotes(self) -> None:
-        input_html = """<p>Some text [pullquote]with a pullquote[/pullquote] and another [pullquote]with a pullquote[/pullquote]</p>"""  # noqa: E501
+        input_html = """<p>Some text <span class="pullquote">with a pullquote</span> and another <span class="pullquote">with a pullquote</span></p>"""  # noqa: E501
         output_html = remove_pullquote_tags(input_html)
         expected_html = """<p>Some text with a pullquote and another with a pullquote</p>"""  # noqa: E501
 
@@ -569,13 +622,15 @@ def test_remove_pullquote_tags_with_multiple_pullquotes(self) -> None:
 
 class ExtractPullquotesSimpleTestCase(SimpleTestCase):
     def test_extract_pullquotes(self) -> None:
-        input_html = """<p>Some text[pullquote]with a pullquote[/pullquote]</p>"""
+        input_html = (
+            """<p>Some text<span class="pullquote">with a pullquote</span></p>"""
+        )
         output_pullquotes = extract_pullquotes(input_html)
         expected_pullquotes = ["with a pullquote"]
         self.assertEqual(output_pullquotes, expected_pullquotes)
 
     def test_extract_pullquotes_with_multiple_pullquotes(self) -> None:
-        input_html = """<p>Some text[pullquote]with a pullquote[/pullquote] and another [pullquote]with a pullquote[/pullquote]</p>"""  # noqa: E501
+        input_html = """<p>Some text<span class="pullquote">with a pullquote</span> and another <span class="pullquote">with a pullquote</span></p>"""  # noqa: E501
         output_pullquotes = extract_pullquotes(input_html)
         expected_pullquotes = ["with a pullquote", "with a pullquote"]
         self.assertEqual(output_pullquotes, expected_pullquotes)
@@ -598,7 +653,7 @@ def test_adapt_html_to_generic_blocks(self) -> None:
         self.assertEqual(generic_blocks[0].block_content, html_string)
 
     def test_adapt_html_to_generic_blocks_with_pullquote(self) -> None:
-        html_string = """<p>Some text</p><p>Some more text</p><p>A paragraph with [pullquote]a pullquote[/pullquote] that should be extracted</p>"""  # noqa: E501
+        html_string = """<p>Some text</p><p>Some more text</p><p>A paragraph with <span class="pullquote">a pullquote</span> that should be extracted</p>"""  # noqa: E501
 
         generic_blocks = adapt_html_to_generic_blocks(html_string)