wiki_import/import_wikidata.py at master · DOsinga/wiki_import · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python

from collections import defaultdict

import argparse
import subprocess
import json

import psycopg2
from psycopg2 import extras


def setup_db(connection_string):
    conn = psycopg2.connect(connection_string)
    cursor = conn.cursor()
    cursor.execute('DROP TABLE IF EXISTS wikidata')
    cursor.execute(
        'CREATE TABLE wikidata ('
        '    wikipedia_id TEXT PRIMARY KEY,'
        '    title TEXT,'
        '    wikidata_id TEXT,'
        '    description TEXT,'
        '    properties JSONB'
        ')'
    )
    cursor.execute('CREATE INDEX wikidata_wikidata_id ON wikidata(wikidata_id)')
    cursor.execute('CREATE INDEX wikidata_properties ON wikidata USING gin(properties)')
    return conn, cursor


def parse_wikidata(lines):
    for line in lines:
        line = line.strip()
        if line and line[0] == '{':
            if line[-1] == ',':
                line = line[:-1]
            yield json.loads(line)


def map_value(value, id_name_map):
    if not value or not 'type' in value or not 'value' in value:
        return None
    typ = value['type']
    value = value['value']
    if typ == 'string':
        return value
    elif typ == 'wikibase-entityid':
        entitiy_id = value['id']
        return id_name_map.get(entitiy_id)
    elif typ == 'time':
        time_split = DATE_PARSE_RE.match(value['time'])
        if not time_split:
            return None
        year, month, day, hour, minute, second = map(int, time_split.groups())
        if day == 0:
            day = 1
        if month == 0:
            month = 1
        return '%04d-%02d-%02dT%02d:%02d:%02d' % (year, month, day, hour, minute, second)
    elif typ == 'quantity':
        return float(value['amount'])
    elif typ == 'monolingualtext':
        return value['text']
    elif typ == 'globecoordinate':
        lat = value.get('latitude')
        lng = value.get('longitude')
        if lat or lng:
            res = {'lat': lat, 'lng': lng}
            globe = value.get('globe', '').rsplit('/', 1)[-1]
            if globe != 'Q2' and globe in id_name_map:
                res['globe'] = globe
            if value.get('altitude'):
                res['altitude'] = value['altitude']
            return res

    return None


def main(dump, cursor):
    """We do two scans:
     - first collect the id -> name / wikipedia title
     - then store the actual objects with a json property.
     The first step takes quite a bit of memory (5Gb) - could possibly be done using a temporary table in postgres.
  """
    c = 0
    skip = 0
    id_name_map = {}
    for d in parse_wikidata(subprocess.Popen(['bzcat'], stdin=open(dump), stdout=subprocess.PIPE).stdout):
        c += 1
        if c % 1000 == 0:
            print(c, skip)
        if d.get('sitelinks') and d['sitelinks'].get('enwiki'):
            value = d['sitelinks']['enwiki']['title']
        elif d['labels'].get('en'):
            value = id_name_map[d['id']] = d['labels']['en']['value']
        else:
            skip += 1
            continue
        id_name_map[d['id']] = value

    wp_ids = set()
    c = 0
    rec = 0
    dupes = 0
    for d in parse_wikidata(subprocess.Popen(['bzcat'], stdin=open(dump), stdout=subprocess.PIPE).stdout):
        c += 1
        if c % 1000 == 0:
            print(c, rec, dupes)
        wikipedia_id = d.get('sitelinks', {}).get('enwiki', {}).get('title')
        title = d['labels'].get('en', {}).get('value')
        description = d['descriptions'].get('en', {}).get('value')
        wikidata_id = d['id']
        properties = {}
        if wikipedia_id and title:
            # There are some duplicate wikipedia_id's in there. We could make wikidata_id the primary key
            # but that doesn't fix the underlying dupe
            if wikipedia_id in wp_ids:
                dupes += 1
                continue
            wp_ids.add(wikipedia_id)
            # Properties are mapped in a way where we create lists as values for wiki entities if there is more
            # than one value. For other types, we always pick one value. If there is a preferred value, we'll
            # pick that one.
            # Mostly this does what you want. For filtering on colors for flags it alllows for the query:
            #   SELECT title FROM wikidata WHERE properties @> '{"color": ["Green", "Red", "White"]}'
            # However, if you'd want all flags that have Blue in them, you'd have to check for just "Blue"
            # and also ["Blue"].
            for prop_id, claims in d['claims'].items():
                prop_name = id_name_map.get(prop_id)
                if prop_name:
                    ranks = defaultdict(list)
                    for claim in claims:
                        mainsnak = claim.get('mainsnak')
                        if mainsnak:
                            data_value = map_value(mainsnak.get('datavalue'), id_name_map)
                            if data_value:
                                lst = ranks[claim['rank']]
                                if mainsnak['datavalue'].get('type') != 'wikibase-entityid':
                                    del lst[:]
                                lst.append(data_value)
                    for r in 'preferred', 'normal', 'depricated':
                        value = ranks[r]
                        if value:
                            if len(value) == 1:
                                value = value[0]
                            else:
                                value = sorted(value)
                            properties[prop_name] = value
                            break

            rec += 1
            cursor.execute(
                'INSERT INTO wikidata (wikipedia_id, title, wikidata_id, description, properties) VALUES (%s, %s, %s, %s, %s)',
                (wikipedia_id, title, wikidata_id, description, extras.Json(properties)),
            )


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Import wikidata into postgress')
    parser.add_argument('--postgres', type=str, help='postgres connection string')
    parser.add_argument('dump', type=str, help='BZipped wikidata dump')

    args = parser.parse_args()
    conn, cursor = setup_db(args.postgres)

    main(args.dump, cursor)

    conn.commit()