Installing Tesseract
# Install the Tesseract OCR engine together with all language data.
# Homebrew no longer accepts the old --all-languages install option; the
# additional languages are packaged as the separate tesseract-lang formula.
brew install tesseract tesseract-lang
Installing PyOCR
# Install the PyOCR package with pip3 (the Python 3 pip).
pip3 install pyocr
Installing Wand and PIL
# Install ImageMagick 6 from Homebrew.
brew install imagemagick@6
# Tell Wand where that ImageMagick lives. Ask Homebrew for the prefix instead
# of hard-coding /usr/local/opt, which is wrong on installs that use another
# prefix (e.g. /opt/homebrew). Assign and export separately so a failing
# `brew --prefix` is not masked by the exit status of `export`.
MAGICK_HOME="$(brew --prefix imagemagick@6)"
export MAGICK_HOME
# Use pip3, matching the PyOCR install, so Wand and Pillow (imported as PIL)
# end up in the same Python 3 interpreter that runs the OCR script.
pip3 install wand pillow
Warming up
from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io
Get Going
# Convert every page of a PDF to a JPEG blob with Wand, then run each blob
# through the first OCR tool PyOCR reports and collect the text per page.

# Fail with a clear message instead of a bare IndexError when PyOCR finds no
# OCR backend on this machine.
tools = pyocr.get_available_tools()
if not tools:
    raise RuntimeError("PyOCR found no OCR tool; install Tesseract first")
tool = tools[0]  # tesseract

languages = tool.get_available_languages()
if not languages:
    raise RuntimeError("The OCR tool reports no installed languages")
lang = languages[0]  # check with tesseract to find out which index you need to use

req_image = []   # JPEG blobs, one per entry of the converted image's sequence
final_text = []  # OCR output strings, in the same order as req_image

# Wand images are used as context managers so they are closed as soon as the
# JPEG blobs have been extracted.
with Image(filename="./PDF_FILE_NAME", resolution=300) as image_pdf:
    with image_pdf.convert('jpeg') as image_jpeg:
        for img in image_jpeg.sequence:
            with Image(image=img) as img_page:
                req_image.append(img_page.make_blob('jpeg'))

# OCR each blob: wrap the bytes in a file-like object so PIL can open it, then
# hand the PIL image to the OCR tool with a plain-text builder.
for img in req_image:
    txt = tool.image_to_string(
        PI.open(io.BytesIO(img)),
        lang=lang,
        builder=pyocr.builders.TextBuilder()
    )
    final_text.append(txt)