Skip to content

Commit 7bd44ca

Browse files
committed
save downloaded URLs to history file, check for duplicates
1 parent 32ce19d commit 7bd44ca

File tree

2 files changed

+50
-21
lines changed

2 files changed

+50
-21
lines changed

.vscode/launch.json

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,14 +37,28 @@
3737
"args": ["-v", "debug", "login"]
3838
},
3939
{
40-
"name": "dl_docs",
40+
"name": "dl_docs debug",
4141
"type": "python",
4242
"request": "launch",
4343
"module": "pytr",
4444
"args": ["-v", "debug", "dl_docs", "dldocs_debug"]
4545
},
4646
{
47-
"name": "dl_docs 2 days",
47+
"name": "dl_docs",
48+
"type": "python",
49+
"request": "launch",
50+
"module": "pytr",
51+
"args": ["dl_docs", "dldocs"]
52+
},
53+
{
54+
"name": "dl_docs 20 days",
55+
"type": "python",
56+
"request": "launch",
57+
"module": "pytr",
58+
"args": ["dl_docs", "dldocs", "--last_days", "20"]
59+
},
60+
{
61+
"name": "dl_docs 2 days debug",
4862
"type": "python",
4963
"request": "launch",
5064
"module": "pytr",

pytr/dl.py

Lines changed: 34 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111

1212
class DL:
13-
def __init__(self, tr, output_path, filename_fmt, since_timestamp=0):
13+
def __init__(self, tr, output_path, filename_fmt, since_timestamp=0, history_file='pytr_history'):
1414
'''
1515
tr: api object
1616
output_path: name of the directory where the downloaded files are saved
@@ -19,6 +19,7 @@ def __init__(self, tr, output_path, filename_fmt, since_timestamp=0):
1919
'''
2020
self.tr = tr
2121
self.output_path = Path(output_path)
22+
self.history_file = self.output_path / history_file
2223
self.filename_fmt = filename_fmt
2324
self.since_timestamp = since_timestamp
2425

@@ -29,8 +30,20 @@ def __init__(self, tr, output_path, filename_fmt, since_timestamp=0):
2930
self.done = 0
3031
self.filepaths = []
3132
self.doc_urls = []
33+
self.doc_urls_history = []
3234
self.tl = Timeline(self.tr)
3335
self.log = get_logger(__name__)
36+
self.load_history()
37+
38+
def load_history(self):
39+
if self.history_file.exists():
40+
with self.history_file.open() as f:
41+
self.doc_urls_history = f.read().splitlines()
42+
self.log.info(f'Found {len(self.doc_urls_history)} lines in history file')
43+
else:
44+
self.history_file.parent.mkdir(exist_ok=True, parents=True)
45+
self.history_file.touch()
46+
self.log.info('Created history file')
3447

3548
async def dl_loop(self):
3649
await self.tl.get_next_timeline(max_age_timestamp=self.since_timestamp)
@@ -102,12 +115,17 @@ def dl_doc(self, doc, titleText, subtitleText, subfolder=None):
102115
if doc_url_base in self.doc_urls:
103116
self.log.debug(f'URL {doc_url_base} already in queue. Skipping...')
104117
return
118+
elif doc_url_base in self.doc_urls_history:
119+
self.log.debug(f'URL {doc_url_base} already in history. Skipping...')
120+
return
105121
else:
106122
self.doc_urls.append(doc_url_base)
107123

108124
future = self.session.get(doc_url)
109125
future.filepath = filepath
126+
future.doc_url_base = doc_url_base
110127
self.futures.append(future)
128+
self.log.debug(f'Added {filepath} to queue')
111129
else:
112130
self.log.debug(f'file {filepath} already exists. Skipping...')
113131

@@ -119,24 +137,21 @@ def work_responses(self):
119137
self.log.info('Nothing to download')
120138
exit(0)
121139

122-
self.log.info('Waiting for downloads to complete..')
123-
for future in as_completed(self.futures):
124-
if future.filepath.is_file() is True:
125-
self.log.debug(f'file {future.filepath} was already downloaded.')
140+
with self.history_file.open('a') as history_file:
141+
self.log.info('Waiting for downloads to complete..')
142+
for future in as_completed(self.futures):
143+
if future.filepath.is_file() is True:
144+
self.log.debug(f'file {future.filepath} was already downloaded.')
126145

127-
r = future.result()
128-
future.filepath.parent.mkdir(parents=True, exist_ok=True)
129-
with open(future.filepath, 'wb') as f:
130-
f.write(r.content)
131-
self.done += 1
146+
r = future.result()
147+
future.filepath.parent.mkdir(parents=True, exist_ok=True)
148+
with open(future.filepath, 'wb') as f:
149+
f.write(r.content)
150+
self.done += 1
151+
history_file.write(f'{future.doc_url_base}\n')
132152

133-
self.log.debug(f'{self.done:>3}/{len(self.doc_urls)} {future.filepath.name}')
153+
self.log.debug(f'{self.done:>3}/{len(self.doc_urls)} {future.filepath.name}')
134154

135-
if self.done == len(self.doc_urls):
136-
self.log.info('Done.')
137-
exit(0)
138-
139-
def dl_all(output_path):
140-
'''
141-
TODO
142-
'''
155+
if self.done == len(self.doc_urls):
156+
self.log.info('Done.')
157+
exit(0)

0 commit comments

Comments
 (0)