# fetch_citations_and_references.py
# Forked from ranihorev/arxiv-network-graph.
import logging
from datetime import datetime
from time import sleep
import pymongo
import requests
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# MongoClient() is called without arguments, so PyMongo's default connection
# settings apply.
client = pymongo.MongoClient()
# Handle to the "arxiv" database; the collections below all live inside it.
mdb = client.arxiv
# Source collection whose documents update_all_papers() walks through.
db_papers = mdb.papers
sem_sch_papers = mdb.sem_sch_papers # semantic scholar data (per-paper records written by update_all_papers)
sem_sch_authors = mdb.sem_sch_authors # semantic scholar data (documents keyed by author name)
def _linked_paper_summaries(items):
    """Reduce citation/reference entries to their arxivId, paperId and title."""
    return [
        {'arxivId': item['arxivId'], 'paperId': item['paperId'], 'title': item['title']}
        for item in items or []
    ]


def send_query(p, is_arxiv):
    """Fetch one paper's record from the Semantic Scholar v1 paper endpoint.

    Args:
        p: Paper document. ``p['_id']`` is the identifier sent to the API;
            ``time_updated`` and ``time_published`` are copied into the
            result when present (``None`` otherwise).
        is_arxiv: When true, the identifier is sent with an ``arXiv:`` prefix.

    Returns:
        A dict with the paper's ids, year, title, authors, citations and
        references plus ``last_rec_update`` (naive UTC) and ``found`` set to
        1, or ``None`` when the request fails, the reply is not JSON, the
        API reports an error, or the reply does not carry a paper payload.
    """
    p_id = p['_id']
    prefix = "arXiv:" if is_arxiv else ""
    url = f'https://api.semanticscholar.org/v1/paper/{prefix}{p_id}'

    try:
        # Without a timeout a single stalled connection blocks the caller
        # forever; requests never times out on its own.
        response = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON (e.g. an HTML
        # error page). Treated like an API error so the caller can go on.
        logger.info(f'Error - {p_id} - {exc}')
        return None

    # Besides explicit {'error': ...} replies, reject anything that is not a
    # paper payload (for instance a throttling message with no paper fields),
    # which would otherwise surface as a KeyError below.
    if not isinstance(response, dict) or 'error' in response or 'paperId' not in response:
        logger.info(f'Error - {p_id} - {response}')
        return None

    authors = [{'id': a['authorId'], 'name': a['name']} for a in response.get('authors') or []]

    return {
        # The API may return a null arxivId; fall back to the id we queried
        # with so the record never ends up keyed by None.
        '_id': response.get('arxivId') or p_id,
        'paperId': response['paperId'],
        'year': response.get('year'),
        'time_updated': p.get('time_updated', None),
        'time_published': p.get('time_published', None),
        'title': response.get('title'),
        'authors': authors,
        'citations': _linked_paper_summaries(response.get('citations')),
        'references': _linked_paper_summaries(response.get('references')),
        'last_rec_update': datetime.utcnow(),
        'found': 1,
    }
def fetch_paper_data(p, is_arxiv=True):
    """Return the Semantic Scholar record for a paper, or a placeholder.

    Args:
        p: Paper document; must contain ``_id``. ``title``, ``authors``,
            ``time_updated`` and ``time_published`` are used for the
            placeholder when available.
        is_arxiv: Forwarded to ``send_query``.

    Returns:
        The dict produced by ``send_query`` when the lookup succeeds.
        Otherwise a placeholder built from ``p`` with ``found`` set to 0 and
        ``last_rec_update`` set to the current naive UTC time.
    """
    res = send_query(p, is_arxiv)
    if res:
        return res

    # send_query treats the time fields as optional (p.get); do the same
    # here so a sparse source document does not raise KeyError on the
    # not-found path only.
    return {
        '_id': p['_id'],
        'title': p.get('title'),
        'authors': p.get('authors') or [],
        'last_rec_update': datetime.utcnow(),
        'time_updated': p.get('time_updated'),
        'time_published': p.get('time_published'),
        'found': 0,
    }
def update_all_papers(age_days=5):
    """Refresh stored Semantic Scholar data for every paper in ``db_papers``.

    A paper is fetched again when it has no document in ``sem_sch_papers``
    or when that document's ``last_rec_update`` is older than ``age_days``.
    Each fetched record is upserted into ``sem_sch_papers`` and each of its
    author names is upserted into ``sem_sch_authors``.

    Args:
        age_days: Maximum age, in days, of a stored record before it is
            fetched again.
    """
    logger.info('Updating citations and references')
    max_age_seconds = age_days * 86400

    # Materialised up front so the total can be logged before the loop.
    papers = list(db_papers.find())
    logger.info(f'Fetching {len(papers)} documents')

    for idx, p in enumerate(papers):
        if idx % 500 == 0:
            logger.info(f'Updating batch {idx}')

        cur_sem_sch = sem_sch_papers.find_one({'_id': p['_id']})
        if cur_sem_sch:
            age_seconds = (datetime.utcnow() - cur_sem_sch['last_rec_update']).total_seconds()
            if age_seconds <= max_age_seconds:
                logger.debug('Paper is already in DB')
                continue

        res = fetch_paper_data(p)
        # Collection.update() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; update_one(..., upsert=True) is the equivalent call for
        # an operator-style ($set) update.
        sem_sch_papers.update_one({'_id': res['_id']}, {'$set': res}, upsert=True)
        for a in res['authors']:
            # The legacy update() with an empty document was a whole-document
            # replacement; replace_one keeps exactly that behaviour (the
            # author document holds nothing but its _id afterwards).
            sem_sch_authors.replace_one({'_id': a['name']}, {}, upsert=True)
        # Short pause between API requests.
        sleep(0.2)

    logger.info('Finished updating refs')
if __name__ == '__main__':
    # With no handler configured, the logging module only emits WARNING and
    # above, so every logger.info() progress message would be silently
    # dropped when this file is run as a script.
    logging.basicConfig(level=logging.INFO)
    update_all_papers()