1+ import os
12from abc import ABC , abstractmethod
23from typing import Any , Dict , List
34
5+ import requests
6+
47
58class BaseReader (ABC ):
69 """
@@ -18,3 +21,45 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
1821 :param file_path: Path to the input file.
1922 :return: List of dictionaries containing the data.
2023 """
24+
@staticmethod
def filter(data: List[dict]) -> List[dict]:
    """
    Drop entries whose payload is empty or whose referenced image is missing.

    Rules, keyed on each item's ``"type"``:

    * ``"text"``: kept only when ``"content"`` is a string that is non-empty
      after stripping whitespace. A missing, ``None`` or non-string content
      is treated as empty instead of raising.
    * ``"image"``, ``"table"``, ``"equation"``: kept only when ``"img_path"``
      points to an existing local file or to a URL answering ``HEAD`` with
      status 200.
    * any other type (including a missing ``"type"``): always kept.

    The relative order of the surviving items is preserved and the input
    list is not modified.

    :param data: List of dictionaries containing the data.
    :return: Filtered list of dictionaries.
    """
    remote_schemes = ("http://", "https://", "ftp://")
    file_scheme = "file://"

    def _image_exists(path_or_url: Any, timeout: int = 3) -> bool:
        """
        Check if an image exists at the given local path or URL.

        :param path_or_url: Local file path or remote URL of the image.
        :param timeout: Timeout for remote URL requests in seconds.
        :return: True if the image exists, False otherwise.
        """
        # Path-like objects can only denote local files.
        if isinstance(path_or_url, os.PathLike):
            return os.path.isfile(path_or_url)
        # None, empty strings and unsupported types cannot name an image.
        if not isinstance(path_or_url, str) or not path_or_url:
            return False

        if not path_or_url.startswith(remote_schemes):
            # Strip the scheme only when it is a real prefix; a blind
            # replace() would also eat "file://" found later in the path.
            if path_or_url.startswith(file_scheme):
                path_or_url = path_or_url[len(file_scheme):]
            return os.path.isfile(path_or_url)

        # NOTE(review): requests ships no ftp adapter by default, so ftp://
        # URLs presumably end up in the except branch below -- confirm
        # whether ftp support is actually intended.
        try:
            resp = requests.head(
                path_or_url, allow_redirects=True, timeout=timeout
            )
        except requests.RequestException:
            return False
        return resp.status_code == 200

    filtered_data = []
    for item in data:
        item_type = item.get("type")
        if item_type == "text":
            content = item.get("content")
            if isinstance(content, str) and content.strip():
                filtered_data.append(item)
        elif item_type in ("image", "table", "equation"):
            if _image_exists(item.get("img_path")):
                filtered_data.append(item)
        else:
            filtered_data.append(item)
    return filtered_data
0 commit comments