-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
92 lines (76 loc) · 2.85 KB
/
app.py
File metadata and controls
92 lines (76 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import html
import re

from fastapi import FastAPI, Request, Form
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from transformers import pipeline
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()
# Hugging Face "ner" pipeline, built once at import time so every request
# reuses the same loaded model instead of reloading it per call.
# The POST handler reads the "entity_group", "start" and "end" keys of each
# result this pipeline returns.
ner = pipeline(
"ner", model="Davlan/xlm-roberta-base-ner-hrl", aggregation_strategy="simple"
)
# Jinja2 template loader rooted at the local "templates" directory.
templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render ``index.html`` with empty input and empty highlighted output."""
    # Same context keys as the POST handler, so the template can render
    # both the initial page and the analysis result.
    context = {"request": request, "highlighted_text": "", "text": ""}
    return templates.TemplateResponse("index.html", context)
def build_highlight_spans_from_offsets(entities, label):
    """Convert entity dicts into highlight spans tagged with *label*.

    Each item of *entities* must provide ``"start"`` and ``"end"`` keys; any
    other keys are ignored. Returns a new list of
    ``{"start", "end", "label"}`` dicts in the same order as the input.
    """
    return [
        {"start": entity["start"], "end": entity["end"], "label": label}
        for entity in entities
    ]
# Regex-based detectors as (label, compiled pattern) pairs.
# The order matters: spans are later sorted by start offset with a stable
# sort, so when two detectors match at the same offset the earlier entry in
# this table wins. Each label is also used as the CSS class of the emitted
# <span>.
_REGEX_DETECTORS = (
    ("email", re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")),
    ("phone", re.compile(r"\+?\d[\d\s\-()]{7,}")),
    (
        "address",
        re.compile(
            r"\d{1,5}\s\w+\s\w+|\b(street|road|ave|avenue|boulevard|rd|blvd)\b",
            flags=re.I,
        ),
    ),
    ("id", re.compile(r"(order|customer|serial)[\s:#\-]*\w+", flags=re.I)),
    (
        "credentials",
        re.compile(r"(username|password|login)[\s:=\-]*\w+", flags=re.I),
    ),
    (
        "warranty",
        re.compile(r"(warranty|ticket|reference)[\s:#\-]*\w+", flags=re.I),
    ),
)


def _find_regex_spans(text):
    """Return a span dict for every match of every detector in *text*."""
    return [
        {"start": match.start(), "end": match.end(), "label": label}
        for label, pattern in _REGEX_DETECTORS
        for match in pattern.finditer(text)
    ]


def _render_highlights(text, spans):
    """Wrap each non-overlapping span of *text* in a ``<span class=label>``.

    Spans are processed in order of start offset; a span that begins inside
    an already-emitted span is dropped. All text taken from *text* is
    HTML-escaped, so the only markup in the result is the ``<span>`` tags
    generated here.
    """
    pieces = []
    cursor = 0
    # sorted() is stable, so ties on "start" keep their original order.
    for span in sorted(spans, key=lambda item: item["start"]):
        start, end = span["start"], span["end"]
        if start < cursor:
            # Overlaps the previously emitted span; skip it.
            continue
        pieces.append(html.escape(text[cursor:start]))
        pieces.append(
            f'<span class="{span["label"]}">{html.escape(text[start:end])}</span>'
        )
        cursor = end
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


@app.post("/", response_class=HTMLResponse)
async def analyze(request: Request, text: str = Form(...)):
    """Detect sensitive-looking fragments in *text* and render them highlighted.

    Person names come from the module-level ``ner`` pipeline (entities whose
    ``entity_group`` is ``"PER"``); e-mails, phone numbers, addresses, IDs,
    credentials and warranty/ticket references come from regular expressions.

    Args:
        request: Incoming request, passed through to the template context.
        text: Submitted form field containing the text to analyse.

    Returns:
        The rendered ``index.html`` template with ``highlighted_text`` (HTML
        markup) and the original ``text`` in its context.
    """
    # NOTE(review): ner() is a synchronous call made directly inside this
    # coroutine, so it runs on the event loop while the model is working.
    persons = [ent for ent in ner(text) if ent["entity_group"] == "PER"]
    # Person spans go first so they take precedence on equal start offsets.
    spans = build_highlight_spans_from_offsets(persons, "person")
    spans += _find_regex_spans(text)

    # The user-supplied text is escaped inside _render_highlights because the
    # result contains real <span> tags and is presumably rendered by the
    # template without auto-escaping (e.g. via ``|safe``) -- TODO confirm.
    highlighted_text = _render_highlights(text, spans)

    return templates.TemplateResponse(
        "index.html",
        {"request": request, "highlighted_text": highlighted_text, "text": text},
    )