entity-extraction/index.py at master · AnthonySigogne/entity-extraction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
API to extract entities from a text.
The goal is to identify the most important "concepts" of a text : companies, brands, people, places...
Each concept, or entity, is associated with additional data like the link to the Wikipedia page, an abstract or a photo.
This API uses the library DBPedia Spotlight : https://github.com/dbpedia-spotlight/dbpedia-spotlight.
"""

__author__ = "Anthony Sigogne"
__copyright__ = "Copyright 2017, Byprog"
__email__ = "anthony@byprog.com"
__license__ = "MIT"
__version__ = "1.0"

# libraries used by the tool
import subprocess
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import Counter
from flask import Flask, request, jsonify, url_for, render_template
# Flask application object; the route and error-handler decorators in this
# module register their functions on it.
app = Flask(__name__)


@app.route("/entities", methods=['POST'])
def entity_extraction():
    """
    URL : /entities
    Extract entities (Company, Place,...) from a text.
    Method : POST
    Form data :
        - text : the text to analyze
        - language : language of text ("fr" or "en")
    Return a JSON dictionary with one key : {"entities":[list of entities]}
    """
    # NOTE : the docstring above is served verbatim by the "/" helper endpoint,
    # so it stays user-facing API help; implementation notes live in comments.
    # Raises InvalidUsage (HTTP 400) when "text" or "language" is missing or
    # when the language is neither "fr" nor "en".

    def query_dbpedia(entity, lang) :
        """
        Query DBPedia to get more data about entity.

        entity : one item of the Spotlight "Resources" list ("@types" and "@URI" are read)
        lang : "fr" or "en"; selects the DBpedia endpoint and the label/abstract language
        Return a dict with keys uri, dbtype, label, wiki, photo and abstract,
        or None when DBpedia returns no row for the URI.
        """
        # compute entity type from dbpedia types (first match wins)
        types = entity["@types"]
        if "Schema:Place" in types :
            dbtype = u"Place"
        elif "Schema:Person" in types :
            dbtype = u"Person"
        elif "DBpedia:Company" in types :
            dbtype = u"Company"
        else :
            dbtype = u"Entity"

        # create a new entity structure
        entity = {"uri":entity["@URI"], "dbtype":dbtype}

        # query dbpedia for more information about entity
        sparql = SPARQLWrapper("http://%sdbpedia.org/sparql"%("fr." if lang == "fr" else ""))
        sparql.setQuery(u"""
            prefix db-owl: <http://dbpedia.org/ontology/>
            select * where
            {
             ?uri <http://dbpedia.org/ontology/wikiPageID> ?id.
             FILTER (?uri = <%s>)
             OPTIONAL {?uri db-owl:thumbnail ?photo}
             ?uri <http://www.w3.org/ns/prov#wasDerivedFrom> ?wiki.
             ?uri rdfs:label ?label.
             OPTIONAL{?uri db-owl:abstract ?abstract.
             FILTER (LANG(?abstract) = "%s").}
             FILTER (LANG(?label) = "%s")
            }
        """%(entity["uri"],lang,lang))
        sparql.setReturnFormat(JSON)
        bindings = sparql.query().convert()["results"]["bindings"]
        if not bindings :
            # DBpedia knows nothing about this URI : let the caller skip it
            # instead of failing the whole request with an IndexError
            return None
        res = bindings[0]
        entity.update({
            "label":res["label"]["value"],
            "wiki":res["wiki"]["value"],
            # photo and abstract are OPTIONAL in the query, hence the None fallbacks
            "photo":res["photo"]["value"].replace("width=300","width=50") if "photo" in res else None,
            "abstract":res["abstract"]["value"][:150]+"..." if "abstract" in res else None
        })

        return entity

    # get POST data
    data = {key: request.form.get(key) for key in request.form.keys()}

    # validate POST data
    if "text" not in data :
        raise InvalidUsage('No text specified in POST data')
    if "language" not in data :
        raise InvalidUsage('No language specified in POST data')
    if data["language"] not in ["fr", "en"] :
        raise InvalidUsage('Unsupported language')

    # get entities with dbpedia spotlight tool
    port = 2225 if data["language"] == "fr" else 2222
    confidence = 0.3 if data["language"] == "fr" else 0.5
    text_arg = u"text=%s" % data["text"]
    if not isinstance(text_arg, str) :
        # Python 2 : a unicode argument must be encoded before being handed to curl.
        # On Python 3 the text stays a str; formatting encoded bytes with %s
        # would send the literal "b'...'" to Spotlight.
        text_arg = text_arg.encode("utf8")
    res = subprocess.check_output([
        "curl", "http://spotlight.sztaki.hu:%d/rest/annotate"%port,
        '--data-urlencode', text_arg,
        '--data', "confidence=%f"%confidence,
        '-H', "Accept: application/json",
    ])

    # browse entities and get dbpedia data
    entities = []
    seen_uris = set() # URIs already looked up, to query DBpedia once per entity
    for resource in json.loads(res.decode("utf8")).get("Resources",[]) :
        if not float(resource["@similarityScore"]) > 0.92 : # only entities with high similarity score
            continue
        uri = resource["@URI"]
        if uri in seen_uris :
            continue
        seen_uris.add(uri)
        entity = query_dbpedia(resource, data["language"])
        if entity is not None :
            entities.append(entity)

    # return the final list of entities
    return jsonify(entities=entities)

@app.route("/")
def helper():
    """
    URL : /
    Helper that list all services of API.
    """
    # NOTE : docstrings (this one included) are rendered verbatim in the
    # response, so they are user-facing help text.

    # print module docstring; it is None when docstrings are stripped (python -OO)
    output = [(__doc__ or "").replace("\n","<br/>"),]

    # then, get and print docstring of each rule
    for rule in app.url_map.iter_rules():
        if rule.endpoint == "static" : # skip static endpoint
            continue
        # a view without docstring contributes an empty entry instead of crashing
        doc = app.view_functions[rule.endpoint].__doc__ or ""
        output.append(doc.replace("\n","<br/>"))

    return "<br/>".join(output)

class InvalidUsage(Exception):
    """
    Custom invalid usage exception.

    Carries a human-readable message, an HTTP status code (400 unless
    overridden) and an optional payload of extra fields for the JSON body.
    """
    # default HTTP status code, used when the constructor receives none
    status_code = 400

    def __init__(self, message, status_code=None, payload=None):
        """
        message : description of the error, exposed under the "message" key
        status_code : optional HTTP status code overriding the class default
        payload : optional mapping (or iterable of pairs) of extra fields
        """
        # forward the message so str(error), error.args and tracebacks show it
        Exception.__init__(self, message)
        self.message = message
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload

    def to_dict(self):
        """
        Return a new dict made of the payload fields plus the "message" key.
        The payload itself is not modified; a "message" key in the payload
        is overwritten by the exception message.
        """
        rv = dict(self.payload or ())
        rv['message'] = self.message
        return rv

@app.errorhandler(InvalidUsage)
def handle_invalid_usage(error):
    """
    Turn an InvalidUsage exception into a JSON response.

    The body is the dictionary given by error.to_dict() and the HTTP status
    is error.status_code.
    """
    body = error.to_dict()
    reply = jsonify(body)
    reply.status_code = error.status_code
    return reply