-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathocr_lib.py
More file actions
178 lines (150 loc) · 6.21 KB
/
ocr_lib.py
File metadata and controls
178 lines (150 loc) · 6.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function
# MODULE EXPECTS PYQTGRAPH DATA: X AND Y ARE TRANSPOSED!
"""
OCR using tesseract or cuneiform through pyOCR.
pytesseract is also fine to use, and does not need zooming (but results are less), while pyocr needs zoom at least 6.0.
Take care to put the bounding box around only txt, and exclude other objects!
On Ubuntu 16.04: apt install python-pyocr tesseract-ocr tesseract-ocr-eng
Changelog:
20200421: add border
20180914: tesseract wants black text on white
20180731: fix error ValueError: assignment destination is read-only for part[part<ocr_threshold] = 0
20171117: sync with US; prepare for non-transposed data
20171116: fix scipy version 1.0
"""
__version__ = '20200421'
__author__ = 'aschilham'
from PIL import Image
import numpy as np
from scipy import ndimage as scind
import pyocr
import re
import scipy.misc
# sanity check: we need at least scipy 0.10.1 to avoid problems mixing PIL and Pillow
# Only the leading numeric fields of the version string are used, so that
# pre-release/development tags (e.g. "1.0.0rc1") do not break the import.
scipy_version = [int(n) for n in re.findall(r'\d+', scipy.__version__)[:3]]
# Tuple comparison checks major, minor and micro in order; a missing micro
# field (e.g. "0.10") correctly compares as older than 0.10.1.
if scipy_version and tuple(scipy_version) < (0, 10, 1):
    raise RuntimeError("scipy version too old. Upgrade scipy to at least 0.10.1")
def getOCRTool():
    """
    Return the first OCR tool (tesseract or cuneiform) that pyocr reports as available.
    Prints the name of the selected tool.
    Raises RuntimeError if pyocr finds no tool at all.
    """
    available = pyocr.get_available_tools()
    if not available:
        raise RuntimeError("ERROR No OCR tool found")

    # pyocr returns the tools as a list; simply take the first entry
    selected = available[0]
    print("[ocr_lib] Using %s for OCR" % (selected.get_name()))
    return selected
def txt2type(txt, type, prefix='', suffix=''):
    """
    Convert a line of OCR output to the requested type.

    txt    = raw text as returned by the OCR tool
    type   = one of 'string', 'bool', 'float' (case-insensitive)
    prefix = if defined, the length of the string is used to skip the first num characters.
    suffix = if defined, the length of the string is used to skip the last num characters.

    Returns a str for 'string', a bool for 'bool' (True for 1/true/y/yes,
    case-insensitive) and a float for 'float'.
    Raises ValueError for an unknown type, for text that is empty after
    removing prefix/suffix, or when the cleaned text cannot be parsed as a float.
    """
    # normalize once, so validation and dispatch agree on case
    kind = str(type).lower()
    if kind not in ('string', 'bool', 'float'):
        raise ValueError('Unknown type %s'%type)

    txt = txt.strip()  # removes '\n'
    txt = txt[len(prefix):]
    if len(suffix) > 0:  # guard needed: txt[:-0] would throw away everything
        txt = txt[:-len(suffix)]
    txt = txt.strip()
    if len(txt) == 0:
        raise ValueError('[ocr_lib] ERROR! empty text line!')

    if kind == 'string':
        return txt

    if kind == 'bool':
        return txt.lower() in ('1', 'true', 'y', 'yes')

    # kind == 'float': repair characters that OCR frequently confuses with digits
    fixfloat = {
        # some frequent problems
        '0': ['O', 'o'],
        '1': ['l'],
        '2': ['Z', 'z'],
        '4': ['A'],
        '5': ['S', 's'],
        '7': ['/'],
        '8': ['B'],
        '9': ['Q', 'g'],
    }

    # first strip % and spaces and turn comma into period (without warning)
    txt = txt.replace('%', '').replace(' ', '').replace(',', '.')
    for fv, svs in fixfloat.items():
        for sv in svs:
            txt = txt.replace(sv, fv)

    # next the other characters: keep digits and periods only.
    # NOTE(review): this also removes a leading '-', so negative values come
    # back positive; confirm that is intended before relying on signs.
    newtxt = re.sub(r'[^\d.]+', '', txt)
    if newtxt != txt:
        msg = u"[ocr_lib] Warning: replaced value {} by {}".format(txt, newtxt)
        if str is bytes:
            # Python 2 only: encode explicitly. On Python 3 printing the
            # encoded bytes would show up as b'...'.
            msg = msg.encode('utf8')
        print(msg)
        txt = newtxt

    return float(txt)
def OCR(pixeldata, xywh, zpos=0, ocr_zoom=10, ocr_threshold=0, ocr_border=0, transposed=True):
    """
    Use pyOCR for OCR on a rectangular part of an image.

    pixeldata = 2D image, or 3D stack of images of which slice zpos is used
    xywh = bounding box [x, y, width, height] of the part holding the text
    zpos = index of the slice to use if pixeldata is 3D
    ocr_zoom = factor to enlarge the part before OCR (default 10); if None, a
               factor is chosen that brings a part smaller than 600x200 pixels
               up to at least that size, and a larger part is not zoomed
    ocr_threshold = remove values below this threshold (after inversion)
    ocr_border = width in pixels of a border of value 255 added around the part
    transposed = pixeldata is transposed (old format)

    Returns (txt, part): the text reported by the OCR tool and the
    preprocessed image array that was offered to it.
    Raises ValueError if pixeldata is neither 2D nor 3D; getOCRTool raises
    RuntimeError if no OCR tool is available.
    """
    tool = getOCRTool()

    # slice-out the relevant part of the image
    x, y, width, height = xywh
    ndim = len(np.shape(pixeldata))
    if transposed:  # input was pyqtgraph-like
        if ndim == 3:
            pixeldata = np.transpose(pixeldata, (0, 2, 1))
        else:
            pixeldata = np.transpose(pixeldata)

    # np.array() copies, so the in-place edits below work on read-only input
    # and never modify the caller's data
    if ndim == 3:
        part = np.array(pixeldata[zpos][y:y+height, x:x+width])
    elif ndim == 2:
        part = np.array(pixeldata[y:y+height, x:x+width])
    else:
        raise ValueError('[ocr_lib] Unknown dataformat of %d dimensions'%ndim)

    # heuristic contrast enhancement: want white is txt, background = 0
    # invert if needed; the mean of the first and last row is taken as background
    edgeval = (np.mean(part[0, :]) + np.mean(part[-1, :]))/2
    if edgeval > 128:
        part = edgeval - part
        part[part < 0] = 0

    # remove noise/gradient
    part[part < ocr_threshold] = 0

    # enhance contrast: stretch a low-contrast part to the range 0..255.
    # A completely uniform part (range 0) is left alone to avoid dividing by
    # zero; 255.0 forces true division for integer data on Python 2.
    minval = np.min(part)
    maxval = np.max(part)
    valrange = maxval - minval
    if 0 < valrange < 128:
        part = (part - minval)*(255.0/valrange)

    if ocr_zoom is None:
        # enlarge to prevent OCR mismatches; below 20px font height accuracy drops off
        minheight = 200  # this value to prevent pixGenHalftoneMask errors
        minwidth = 600   # this value to prevent pixGenHalftoneMask errors
        if height < minheight or width < minwidth:
            ocr_zoom = int(max(float(minheight)/height, float(minwidth)/width)) + 1

    if ocr_zoom is not None:
        # scind.zoom is the public entry point; the scind.interpolation
        # namespace is deprecated in recent scipy versions
        part = np.round(scind.zoom(part, zoom=(ocr_zoom, ocr_zoom), order=1))

    # extract numbers/text from bounding box
    # 20180914: actually tesseract wants black text on white, so invert!
    maxval = np.max(part)
    part = maxval - part

    # 20200421: add border
    if ocr_border > 0:
        dimy, dimx = np.shape(part)
        part2 = np.full((dimy + 2*ocr_border, dimx + 2*ocr_border), 255, dtype=part.dtype)
        part2[ocr_border:dimy+ocr_border, ocr_border:dimx+ocr_border] = part
        part = part2

    txt = tool.image_to_string(Image.fromarray(part))
    return txt, part