Generic_OCR/ocr_lib.py at master · MedPhysQC/Generic_OCR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function

# MODULE EXPECTS PYQTGRAPH DATA: X AND Y ARE TRANSPOSED!
"""
OCR using tesseract or cuneiform through pyOCR.
pytesseract is also fine to use and does not need zooming (but its results are worse), while pyocr needs a zoom of at least 6.0.
Take care to put the bounding box around only txt, and exclude other objects!

On Ubuntu 16.04: apt install python-pyocr tesseract-ocr tesseract-ocr-eng

Changelog:
    20200421: add border
    20180914: tesseract wants black text on white
    20180731: fix error ValueError: assignment destination is read-only for part[part<ocr_threshold] = 0
    20171117: sync with US; prepare for non-transposed data
    20171116: fix scipy version 1.0
"""
# module version: the date (YYYYMMDD) of the most recent changelog entry above
__version__ = '20200421'
# author handle of this module
__author__ = 'aschilham'

from PIL import Image
import numpy as np
from scipy import ndimage as scind
import pyocr
import re
import scipy.misc
# sanity check: we need at least scipy 0.10.1 to avoid problems mixing PIL and Pillow
# Only the leading digits of each dot-separated component are used, and parsing
# stops at the first component without digits, so pre-release/dev versions
# (e.g. '1.0.0rc1' or '1.12.0.dev0+abc') do not break the import of this module.
scipy_version = []
for _component in scipy.__version__.split('.'):
    _digits = re.match(r'\d+', _component)
    if _digits is None:
        break
    scipy_version.append(int(_digits.group()))

# compare (major, minor, patch) as a whole; a missing patch level counts as older than .1
if tuple(scipy_version[:3]) < (0, 10, 1):
    raise RuntimeError("scipy version too old. Upgrade scipy to at least 0.10.1")

def getOCRTool():
    """
    Return the first OCR tool (e.g. tesseract or cuneiform) that pyocr reports as available.
    The name of the selected tool is printed.
    Raises RuntimeError if pyocr does not find any tool.
    """
    available = pyocr.get_available_tools()
    if not available:
        raise RuntimeError("ERROR No OCR tool found")

    # pyocr returns the tools in its own order of preference; just take the first one
    selected = available[0]
    print("[ocr_lib] Using %s for OCR" % (selected.get_name()))
    return selected


def txt2type(txt, type, prefix='',suffix=''):
    """
    Convert a line of OCR text to a value of the requested type.

    txt    = the text to convert
    type   = 'string', 'bool' or 'float' (case-insensitive)
    prefix = if defined, the length of the string is used to skip the first num characters.
    suffix = if defined, the length of the string is used to skip the last num characters.

    Returns:
      'string': the stripped text
      'bool'  : True if the text is one of '1', 'true', 'y', 'yes' (case-insensitive), else False
      'float' : the text converted to float, after removing '%' and spaces, turning a comma
                into a period, replacing characters that are frequently mistaken for digits,
                and dropping all remaining characters that are not a digit or a period
                (with a printed warning). Note that this also drops a minus sign.

    Raises ValueError for an unknown type, for text that is empty after removing prefix/suffix,
    and for text that cannot be converted to a float.
    """
    import sys

    # normalize once, so validation and dispatch agree on the spelling of the type
    kind = type.lower() if hasattr(type, 'lower') else type
    if kind not in ('string', 'bool', 'float'):
        raise ValueError('Unknown type %s'%type)

    txt = txt.strip() # removes '\n'
    txt = txt[len(prefix):]
    if len(suffix)>0:
        # guard needed: txt[:-0] would return an empty string
        txt = txt[:-len(suffix)]
    txt = txt.strip()

    if len(txt) == 0:
        raise ValueError('[ocr_lib] ERROR! empty text line!')

    if kind == 'string':
        return txt

    if kind == 'bool':
        return (txt.lower() in ['1', 'true', 'y', 'yes'])

    # kind == 'float' from here on
    # digit: characters that the OCR frequently returns instead of that digit
    fixfloat = {
        # some frequent problems
        '0':['O', 'o'],
        '1': ['l'],
        '2': ['Z', 'z'],
        '4': ['A'],
        '5': ['S', 's'],
        '7': ['/'],
        '8': ['B'],
        '9': ['Q', 'g'],
    }

    # first strip % and spaces and turn comma into period (without warning)
    txt = txt.replace('%','').replace(' ','').replace(',', '.')
    for fv,svs in fixfloat.items():
        for sv in svs:
            txt = txt.replace(sv, fv)

    # next the other characters: keep only digits and periods
    newtxt = re.sub(r'[^\d.]+', '', txt)
    if newtxt != txt:
        msg = u"[ocr_lib] Warning: replaced value {} by {}".format(txt, newtxt)
        if sys.version_info[0] < 3:
            # python2: encode explicitly, stdout might not accept unicode.
            # python3: print the text itself; printing bytes would show b'...'
            msg = msg.encode('utf8')
        print(msg)
        txt = newtxt

    return float(txt)


def OCR(pixeldata, xywh, zpos=0, ocr_zoom=10, ocr_threshold=0, ocr_border=0, transposed=True):
    """
    Run OCR through pyOCR on a rectangular part of an image.

    pixeldata     = 2D image, or 3D stack of images of which slice zpos is used
    xywh          = bounding box of the text: [x, y, width, height] with x,y the upper left pixel location
    zpos          = index of the slice to use if pixeldata is 3D
    ocr_zoom      = factor to enlarge image (default 10). If None, a factor is calculated that makes
                    the part at least 200 pixels high and 600 pixels wide (no zoom if it already is)
    ocr_threshold = remove values below this threshold (after inversion)
    ocr_border    = width in pixels of a border of value 255 to add around the part (0 = no border)
    transposed    = pixeldata is transposed (old format)

    Returns (txt, part): the text found by the OCR tool, and the preprocessed image offered to that tool.
    Raises ValueError if pixeldata is not 2D or 3D, and RuntimeError if no OCR tool is available.
    """
    tool = getOCRTool()

    # slice-out the relevant part of the image
    x,y,width,height = xywh

    if transposed:  # input was pyqtgraph-like
        if len(np.shape(pixeldata)) == 3:
            pixeldata = np.transpose(pixeldata,(0,2,1))
        else:
            pixeldata = np.transpose(pixeldata)

    # np.array makes a writable copy, so the input data is not modified below
    ndim = len(np.shape(pixeldata))
    if ndim == 3:
        part = np.array(pixeldata[zpos][y:y+height, x:x+width])
    elif ndim == 2:
        part = np.array(pixeldata[y:y+height, x:x+width])
    else:
        raise ValueError('[ocr_lib] Unknown dataformat of %d dimensions'%ndim)

    # heuristic contrast enhancement: want white is txt, background = 0
    # invert if needed; the mean of the first and last row is used as background value
    edgeval = (np.mean(part[0,:])+np.mean(part[-1,:]))/2
    if edgeval > 128:
        part = edgeval-part
        part[part<0] = 0

    # remove noise/gradient
    part[part<ocr_threshold] = 0

    # enhance contrast: stretch a small range of values to 0..255
    minval = np.min(part)
    maxval = np.max(part)
    valrange = float(maxval)-float(minval)
    # skip a uniform part (valrange == 0) to avoid a division by zero, which would turn all values into NaN
    if 0 < valrange < 128:
        part = (part-minval)*(255.0/valrange)

    if ocr_zoom is None:
        # enlarge to prevent OCR mismatches; below 20px font height accuracy drops off
        minheight = 200 # this value to prevent pixGenHalftoneMask errors
        minwidth = 600 # this value to prevent pixGenHalftoneMask errors
        if height<minheight or width<minwidth:
            minzoom = int(max([minheight/height,minwidth/width]))+1
            ocr_zoom = minzoom

    if ocr_zoom is not None:
        # scind.zoom is the same function as the deprecated scind.interpolation.zoom
        part = np.round(scind.zoom(part, zoom=(ocr_zoom,ocr_zoom),order=1))

    # extract numbers/text from bounding box
    ##import pytesseract
    ##txt = pytesseract.image_to_string(Image.fromarray(part))

    # 20180914: actually tesseract wants black text on white, so invert!
    maxval = np.max(part)
    part = maxval-part

    # 20200421: add border
    if ocr_border>0:
        dimy,dimx = np.shape(part)
        part2 = np.full((dimy+2*ocr_border, dimx+2*ocr_border), 255, dtype=part.dtype)
        part2[ocr_border:dimy+ocr_border,ocr_border:dimx+ocr_border] = part
        part = part2

    txt = tool.image_to_string(Image.fromarray(part))
    return txt, part