1
1
from functools import cache
2
2
from itertools import chain
3
- from typing import Iterable , Iterator , List
3
+ from typing import Iterable , Iterator
4
4
from urllib .parse import urljoin
5
5
from uuid import uuid5
6
6
@@ -31,14 +31,14 @@ def add_warc_direct_answer_parser(
31
31
url_pattern_regex : str | None ,
32
32
priority : float | None ,
33
33
parser_type : WarcDirectAnswerParserType ,
34
- xpaths : List [ str ] | None ,
34
+ xpath : str | None ,
35
35
url_xpath : str | None ,
36
36
text_xpath : str | None ,
37
37
) -> None :
38
38
if priority is not None and priority <= 0 :
39
39
raise ValueError ("Priority must be strictly positive." )
40
40
if parser_type == "xpath" :
41
- if xpaths is None :
41
+ if xpath is None :
42
42
raise ValueError ("No XPath given." )
43
43
else :
44
44
raise ValueError (f"Invalid parser type: { parser_type } " )
@@ -58,7 +58,7 @@ def add_warc_direct_answer_parser(
58
58
url_pattern_regex = url_pattern_regex ,
59
59
priority = priority ,
60
60
parser_type = parser_type ,
61
- xpaths = xpaths ,
61
+ xpath = xpath ,
62
62
url_xpath = url_xpath ,
63
63
text_xpath = text_xpath ,
64
64
)
@@ -79,56 +79,55 @@ def _parse_warc_direct_answer(
79
79
80
80
# Parse direct answer.
81
81
if parser .parser_type == "xpath" :
82
- if parser .xpaths is None :
82
+ if parser .xpath is None :
83
83
raise ValueError ("No XPath given." )
84
84
with open_warc (warc_store , warc_location ) as record :
85
85
tree = parse_xml_tree (record )
86
86
if tree is None :
87
87
return None
88
88
89
- for xpath in parser .xpaths :
90
- elements = safe_xpath (tree , xpath , _Element )
91
- if len (elements ) == 0 :
92
- return None
89
+ elements = safe_xpath (tree , parser .xpath , _Element )
90
+ if len (elements ) == 0 :
91
+ return None
93
92
94
- direct_answers = []
95
- element : _Element
96
- for i , element in enumerate (elements ):
97
- url : str | None = None
98
- if parser .url_xpath is not None :
99
- urls = safe_xpath (element , parser .url_xpath , str )
100
- if len (urls ) > 0 :
101
- url = urls [0 ].strip ()
102
- url = urljoin (capture_url , url )
103
- text : str | None = None
104
- if parser .text_xpath is not None :
105
- texts = safe_xpath (element , parser .text_xpath , str )
106
- if len (texts ) > 0 :
107
- text = texts [0 ].strip ()
93
+ direct_answers = []
94
+ element : _Element
95
+ for i , element in enumerate (elements ):
96
+ url : str | None = None
97
+ if parser .url_xpath is not None :
98
+ urls = safe_xpath (element , parser .url_xpath , str )
99
+ if len (urls ) > 0 :
100
+ url = urls [0 ].strip ()
101
+ url = urljoin (capture_url , url )
102
+ text : str | None = None
103
+ if parser .text_xpath is not None :
104
+ texts = safe_xpath (element , parser .text_xpath , str )
105
+ if len (texts ) > 0 :
106
+ text = texts [0 ].strip ()
108
107
109
- content : str = tostring (
110
- element ,
111
- encoding = str ,
112
- method = "xml" ,
113
- pretty_print = False ,
114
- with_tail = True ,
115
- )
116
- direct_answer_id_components = (
117
- serp_id ,
118
- parser .id ,
119
- str (hash (content )),
120
- str (i ),
121
- )
122
- direct_answer_id = str (uuid5 (
123
- NAMESPACE_RESULT ,
124
- ":" .join (direct_answer_id_components ),
125
- ))
126
- direct_answers .append (DirectAnswer (
127
- id = direct_answer_id ,
128
- content = content ,
129
- url = url ,
130
- text = text ,
131
- ))
108
+ content : str = tostring (
109
+ element ,
110
+ encoding = str ,
111
+ method = "xml" ,
112
+ pretty_print = False ,
113
+ with_tail = True ,
114
+ )
115
+ direct_answer_id_components = (
116
+ serp_id ,
117
+ parser .id ,
118
+ str (hash (content )),
119
+ str (i ),
120
+ )
121
+ direct_answer_id = str (uuid5 (
122
+ NAMESPACE_RESULT ,
123
+ ":" .join (direct_answer_id_components ),
124
+ ))
125
+ direct_answers .append (DirectAnswer (
126
+ id = direct_answer_id ,
127
+ content = content ,
128
+ url = url ,
129
+ text = text ,
130
+ ))
132
131
return direct_answers
133
132
else :
134
133
raise ValueError (f"Unknown parser type: { parser .parser_type } " )
0 commit comments