Extract text from a Japanese shopping receipt (OCR)

Author: Iqbal

In [13]:
import pytesseract
from PIL import Image
import re


img = "test.jpg"   # click to view image

# Receipt labels, compared against each OCR line after its ASCII spaces have
# been removed, so any spacing the OCR inserts between the characters matches.
# NOTE(review): "合財" is presumably how the OCR renders the receipt's total
# label on the sample image -- confirm against other receipts.
_TOTAL_LABEL = "合財"
_GIVEN_LABEL = "お預り"
_CHANGE_LABEL = "お釣"

# Optional sign, digits, optional decimal point, optional trailing digits.
_NUMBER_RE = re.compile(r'-?\d+\.?\d*')


def _extract_amount(squeezed_line):
    """Return all numeric runs of *squeezed_line* concatenated into one string.

    Backslashes are dropped first so that a backslash sitting between a sign
    and its digits does not split the number.
    """
    return "".join(_NUMBER_RE.findall(squeezed_line.replace("\\", "")))


def parse_receipt_lines(lines):
    """Pick the date line and the three amounts out of OCR'd receipt lines.

    Args:
        lines: iterable of text lines, one per OCR output line.

    Returns:
        A dict with the keys ``"time"``, ``"total"``, ``"given by customer"``
        and ``"return to customer"``. ``"time"`` is the unmodified line that
        contains all of 年, 月 and 日; the other three are digit strings taken
        from the line carrying the matching label. A field stays ``""`` when
        no line matches, and the last matching line wins when several do.
    """
    fields = {
        "time": "",
        "total": "",
        "given by customer": "",
        "return to customer": "",
    }
    for line in lines:
        if "年" in line and "月" in line and "日" in line:
            fields["time"] = line

        # Spaces are removed before label matching; this covers every spacing
        # variant of the labels without listing each one by hand.
        squeezed = line.replace(" ", "")

        if _TOTAL_LABEL in squeezed:
            fields["total"] = _extract_amount(squeezed)

        if _GIVEN_LABEL in squeezed:
            fields["given by customer"] = _extract_amount(squeezed)

        if _CHANGE_LABEL in squeezed:
            # The change amount comes from this line's own digits (the
            # previous code joined the characters of `total` here instead).
            fields["return to customer"] = _extract_amount(squeezed)

    return fields


if __name__ == "__main__":
    # Close the image file as soon as the OCR pass is done.
    with Image.open(img) as receipt_image:
        res = pytesseract.image_to_string(receipt_image, lang="jpn")
    res = res.split("\n")

    parsed = parse_receipt_lines(res)

    # Keep the module-level names the script has always exposed.
    time = parsed["time"]
    total = parsed["total"]
    given = parsed["given by customer"]
    ret = parsed["return to customer"]

    print(parsed)
{'time': '2018年 3月 1日 17:35', 'total': '4.698', 'given by customer': '5000', 'return to customer': '302'}