Skip to content

Commit 45ee69d

Browse files
committed
Changed the `xpaths: List[str]` field to a single `xpath: str`
1 parent f3f11e8 commit 45ee69d

File tree

2 files changed

+47
-48
lines changed

2 files changed

+47
-48
lines changed

archive_query_log/orm.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
from datetime import datetime
22
from functools import cached_property
33
from re import Pattern, compile as pattern
4-
from typing import Literal, List
4+
from typing import Literal
55

66
from elasticsearch_dsl import Document, Keyword, Text, Date, RankFeature, \
77
InnerDoc as InnerDocument, Object, Index, Integer, Nested, Long, Boolean
@@ -459,7 +459,7 @@ class WarcDirectAnswerParser(BaseDocument):
459459
url_pattern_regex: str | None = Keyword()
460460
priority: float | None = RankFeature(positive_score_impact=True)
461461
parser_type: WarcDirectAnswerParserType = Keyword()
462-
xpaths: List[str] | None = Keyword()
462+
xpath: str | None = Keyword()
463463
url_xpath: str | None = Keyword()
464464
text_xpath: str | None = Keyword()
465465

archive_query_log/parsers/warc_direct_answers.py

+45-46
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
from functools import cache
22
from itertools import chain
3-
from typing import Iterable, Iterator, List
3+
from typing import Iterable, Iterator
44
from urllib.parse import urljoin
55
from uuid import uuid5
66

@@ -31,14 +31,14 @@ def add_warc_direct_answer_parser(
3131
url_pattern_regex: str | None,
3232
priority: float | None,
3333
parser_type: WarcDirectAnswerParserType,
34-
xpaths: List[str] | None,
34+
xpath: str | None,
3535
url_xpath: str | None,
3636
text_xpath: str | None,
3737
) -> None:
3838
if priority is not None and priority <= 0:
3939
raise ValueError("Priority must be strictly positive.")
4040
if parser_type == "xpath":
41-
if xpaths is None:
41+
if xpath is None:
4242
raise ValueError("No XPath given.")
4343
else:
4444
raise ValueError(f"Invalid parser type: {parser_type}")
@@ -58,7 +58,7 @@ def add_warc_direct_answer_parser(
5858
url_pattern_regex=url_pattern_regex,
5959
priority=priority,
6060
parser_type=parser_type,
61-
xpaths=xpaths,
61+
xpath=xpath,
6262
url_xpath=url_xpath,
6363
text_xpath=text_xpath,
6464
)
@@ -79,56 +79,55 @@ def _parse_warc_direct_answer(
7979

8080
# Parse direct answer.
8181
if parser.parser_type == "xpath":
82-
if parser.xpaths is None:
82+
if parser.xpath is None:
8383
raise ValueError("No XPath given.")
8484
with open_warc(warc_store, warc_location) as record:
8585
tree = parse_xml_tree(record)
8686
if tree is None:
8787
return None
8888

89-
for xpath in parser.xpaths:
90-
elements = safe_xpath(tree, xpath, _Element)
91-
if len(elements) == 0:
92-
return None
89+
elements = safe_xpath(tree, parser.xpath, _Element)
90+
if len(elements) == 0:
91+
return None
9392

94-
direct_answers = []
95-
element: _Element
96-
for i, element in enumerate(elements):
97-
url: str | None = None
98-
if parser.url_xpath is not None:
99-
urls = safe_xpath(element, parser.url_xpath, str)
100-
if len(urls) > 0:
101-
url = urls[0].strip()
102-
url = urljoin(capture_url, url)
103-
text: str | None = None
104-
if parser.text_xpath is not None:
105-
texts = safe_xpath(element, parser.text_xpath, str)
106-
if len(texts) > 0:
107-
text = texts[0].strip()
93+
direct_answers = []
94+
element: _Element
95+
for i, element in enumerate(elements):
96+
url: str | None = None
97+
if parser.url_xpath is not None:
98+
urls = safe_xpath(element, parser.url_xpath, str)
99+
if len(urls) > 0:
100+
url = urls[0].strip()
101+
url = urljoin(capture_url, url)
102+
text: str | None = None
103+
if parser.text_xpath is not None:
104+
texts = safe_xpath(element, parser.text_xpath, str)
105+
if len(texts) > 0:
106+
text = texts[0].strip()
108107

109-
content: str = tostring(
110-
element,
111-
encoding=str,
112-
method="xml",
113-
pretty_print=False,
114-
with_tail=True,
115-
)
116-
direct_answer_id_components = (
117-
serp_id,
118-
parser.id,
119-
str(hash(content)),
120-
str(i),
121-
)
122-
direct_answer_id = str(uuid5(
123-
NAMESPACE_RESULT,
124-
":".join(direct_answer_id_components),
125-
))
126-
direct_answers.append(DirectAnswer(
127-
id=direct_answer_id,
128-
content=content,
129-
url=url,
130-
text=text,
131-
))
108+
content: str = tostring(
109+
element,
110+
encoding=str,
111+
method="xml",
112+
pretty_print=False,
113+
with_tail=True,
114+
)
115+
direct_answer_id_components = (
116+
serp_id,
117+
parser.id,
118+
str(hash(content)),
119+
str(i),
120+
)
121+
direct_answer_id = str(uuid5(
122+
NAMESPACE_RESULT,
123+
":".join(direct_answer_id_components),
124+
))
125+
direct_answers.append(DirectAnswer(
126+
id=direct_answer_id,
127+
content=content,
128+
url=url,
129+
text=text,
130+
))
132131
return direct_answers
133132
else:
134133
raise ValueError(f"Unknown parser type: {parser.parser_type}")

0 commit comments

Comments
 (0)