Source code for IQDMPDF.pdf_reader

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# pdf_reader.py
"""Read PDF files into python objects"""
#
# Copyright (c) 2020 Dan Cutright
# This file is part of IQDM-PDF, released under a MIT license.
#    See the file LICENSE included with this distribution
#
# Code adapted from Mark Amery's answer at:
# https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
# Accessed August 8, 2019


from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
from io import StringIO
from IQDMPDF.utilities import (
    get_sorted_indices,
    is_in_tol,
    bbox_to_pos,
    is_numeric,
)

# Default search tolerance for CustomPDFReader.get_block_data: the maximum
# distance a block's x- or y-coordinate may be from the requested position
# (compared against coordinates derived from the parsed bounding boxes).
TOLERANCE = 10


def convert_pdf_to_txt(path):
    """Extract text from a PDF

    The file, the text buffer and the pdfminer converter are released even
    if pdfminer raises while processing a page.

    Parameters
    ----------
    path : str
        Absolute file path to the PDF to be read

    Returns
    ----------
    str
        The text content of the PDF
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # Page selection / access options handed to PDFPage.get_pages.
    # An empty page set and maxpages=0 place no restriction on the pages
    # read (per pdfminer's PDFPage.get_pages).
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, "rb") as fp:
                for page in PDFPage.get_pages(
                    fp,
                    pagenos,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True,
                ):
                    interpreter.process_page(page)
            # Read the buffer before it is closed by the enclosing `with`
            return retstr.getvalue()
        finally:
            device.close()
class CustomPDFReader:
    """Custom PDF Parsing module"""

    def __init__(self, file_path, laparams_kwargs=None):
        """Initialize a CustomPDFReader object

        The PDF is parsed immediately; one PDFPageParser per page is stored
        in ``self.page``.

        Parameters
        ----------
        file_path : str
            Absolute file path to the PDF to be read
        laparams_kwargs : dict, optional
            Keyword arguments passed to pdfminer's LAParams for the layout
            analysis. If None, LAParams is created without arguments.
        """
        # One PDFPageParser per page, appended by convert_pdf_to_text()
        self.page = []
        self.file_path = file_path
        self.laparams_kwargs = laparams_kwargs
        self.convert_pdf_to_text()
        self.data = []

    def __str__(self):
        """Get str rep for each page of the PDF"""
        return "\n".join([str(page) for page in self.page])

    def __repr__(self):
        """Return the str rep"""
        return self.__str__()

    def get_block_data(
        self,
        page,
        pos,
        tol=TOLERANCE,
        text_cleaner=None,
        numeric=None,
        ignored=None,
        mode="bottom-left",
    ):
        """Use PDFPageParser.get_block_data for the provided page

        Parameters
        ----------
        page : int
            The index of the PDF page
        pos : tuple of int, float
            The (x,y) coordinates of the text block to be retrieved
        tol : int, float, tuple
            Maximum distance a block's x or y-coordinate may be from pos. If
            a tuple is provided, first value is the x_tolerance, 2nd is
            y_tolerance
        text_cleaner : callable, optional
            A function called on each text element (e.g., remove leading ':')
        numeric : bool, optional
            If true, only return value if it is numeric. If false, only
            return value if it is not numeric. Leave as None to ignore this
            feature.
        ignored : list, optional
            Optionally provide a list of strings that should be ignored. If
            the value of the block data is in this list, the value will
            become an empty string instead
        mode : str, optional
            Options are combinations of top/center/bottom and
            right/center/left, e.g., 'top-right', 'center-right'. 'center'
            is assumed to be 'center-center'. Default is 'bottom-left'.

        Returns
        ----------
        list of str
            All text data that meet the input constraints
        """
        return self.page[page].get_block_data(
            pos,
            tol,
            text_cleaner=text_cleaner,
            numeric=numeric,
            ignored=ignored,
            mode=mode,
        )

    def convert_pdf_to_text(self):
        """Extract text and coordinates from a PDF

        Appends one PDFPageParser to ``self.page`` for every page in the
        document. The file, parser and aggregator device are released even
        if pdfminer raises part-way through.
        """
        with open(self.file_path, "rb") as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            try:
                # Create a PDF document object that stores the document
                # structure. No password is supplied.
                document = PDFDocument(parser)

                # NOTE: the document's extractability flag is deliberately
                # not checked here.

                # Create a PDF resource manager object that stores shared
                # resources.
                rsrcmgr = PDFResourceManager()

                # BEGIN LAYOUT ANALYSIS
                # Set parameters for analysis.
                kwargs = (
                    {} if self.laparams_kwargs is None else self.laparams_kwargs
                )
                laparams = LAParams(**kwargs)

                # Create a PDF page aggregator object.
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                try:
                    # Create a PDF interpreter object.
                    interpreter = PDFPageInterpreter(rsrcmgr, device)

                    # loop over all pages in the document
                    for p, page in enumerate(PDFPage.create_pages(document)):
                        # read the page into a layout object
                        interpreter.process_page(page)
                        layout = device.get_result()

                        # extract text from this object; each page gets its
                        # own set of parallel lists
                        keys = ["bbox", "x", "y", "text"]
                        page_data = {key: [] for key in keys}
                        self.page.append(
                            PDFPageParser(layout._objs, page_data, page_index=p)
                        )
                finally:
                    device.close()
            finally:
                # Close the parser while its underlying file is still open
                parser.close()

    def get_bbox_of_data(self, text, return_all=False, include_text=False):
        """Get the bounding box for a given string

        Parameters
        ----------
        text : str
            Check all parsed data for this string. Return the first bounding
            box that contains this text. Meant to search for a unique str
        return_all : bool
            If true, then return a list containing all matches, in the order
            they are stored (page by page)
        include_text : bool
            If true, also return the text data

        Returns
        ----------
        dict, list, None
            "page"->int and "bbox"->[x0, y0, x1, y1]. If include_text is
            true, "text"->str will contain the text data. If return_all is
            true, return a list of these dict objects. None is returned if
            no stored text contains ``text``.
        """
        matches = []
        for page_index, page in enumerate(self.page):
            for data_index, stored_text in enumerate(page.data["text"]):
                if text not in stored_text:
                    continue
                match = {
                    "page": page_index,
                    "bbox": page.data["bbox"][data_index],
                }
                if include_text:
                    match["text"] = stored_text
                if not return_all:
                    # Caller only wants the first hit
                    return match
                matches.append(match)
        return matches if matches else None
class PDFPageParser:
    """Custom PDF Page Parsing module"""

    def __init__(self, lt_objs, page_data, page_index=0):
        """Initialization of PDFPageParser

        Parameters
        ----------
        lt_objs : list
            A layout object from PDFPageAggregator.get_result()._objs
        page_data : dict
            A dictionary of lists, with keys 'bbox', 'x', 'y', 'text'. The
            lists are filled (and later reordered) in place.
        page_index : int, optional
            The index of the page
        """
        self.lt_objs = lt_objs
        self.data = page_data
        self.page_index = page_index

        self.parse_obj(lt_objs)
        # Descending y first, then ascending x among blocks sharing a y
        self.sort_all_data_by_y()
        self.sub_sort_all_data_by_x()

    def __str__(self):
        """Get the coordinates and text value for all text blocks"""
        ans = []
        for index, text in enumerate(self.data["text"]):
            ans.append(
                "page_index: %s, data_index: %s\nbbox: %s\n%s"
                % (self.page_index, index, self.data["bbox"][index], text)
            )
        return "\n".join(ans)

    def __repr__(self):
        """Return the str rep"""
        return self.__str__()

    def parse_obj(self, lt_objs):
        """Extract bbox, x, y, and text data from layout objects

        Parameters
        ----------
        lt_objs : list
            A layout object from PDFPageAggregator.get_result()._objs
        """
        # loop over the object list
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # Coordinates are rounded to 2 decimals; x and y are the
                # first two bbox values
                bbox = [round(i, 2) for i in obj.bbox]
                self.data["bbox"].append(bbox)
                self.data["x"].append(bbox[0])
                self.data["y"].append(bbox[1])
                self.data["text"].append(obj.get_text())

            # if it's a container, recurse
            elif isinstance(obj, pdfminer.layout.LTFigure):
                self.parse_obj(obj._objs)

    def sort_all_data_by_y(self):
        """Sort parsed data by y coordinate"""
        self.sort_all_data("y", reverse=True)

    def sub_sort_all_data_by_x(self):
        """Sort each row of data by x-coordinate, keeping y order

        A row is the set of blocks sharing exactly the same y value. Every
        list in ``self.data`` is reordered together so that 'bbox', 'x',
        'y' and 'text' at a given index always describe the same block.
        """
        # Group the positions of the blocks by their y value in one pass
        rows = {}
        for index, y in enumerate(self.data["y"]):
            rows.setdefault(y, []).append(index)

        x_values = self.data["x"]
        for indices in rows.values():
            if len(indices) < 2:
                continue

            # Positions of this row's blocks, ordered by ascending x
            ordered = sorted(indices, key=x_values.__getitem__)
            if ordered == indices:
                continue

            # Move whole records, not just x and text, into the row's slots
            for key in self.data:
                column = self.data[key]
                row_values = [column[i] for i in ordered]
                for target, value in zip(indices, row_values):
                    column[target] = value

    def sort_all_data(self, sort_key, reverse=False):
        """Sort all parsed data by sort_key

        Parameters
        ----------
        sort_key : str
            Either 'x' or 'y'
        reverse : bool
            Passes into standard library sorted() function
        """
        sorted_indices = get_sorted_indices(
            self.data[sort_key], reverse=reverse
        )

        # Apply the same ordering to every parallel list
        for key in list(self.data):
            self.data[key] = [self.data[key][i] for i in sorted_indices]

    def get_block_data(
        self,
        pos,
        tol,
        text_cleaner=None,
        numeric=None,
        ignored=None,
        mode="bottom-left",
    ):
        """Get the text block data by x,y coordinates

        Parameters
        ----------
        pos : list of int, float
            The (x,y) coordinates of the text block to be retrieved
        tol : int, float, tuple
            Maximum distance a block's x or y-coordinate may be from pos. If
            a tuple is provided, first value is the x_tolerance, 2nd is
            y_tolerance
        text_cleaner : callable, optional
            A function called on each text element (e.g., remove leading ':')
        numeric : bool, optional
            If true, only return value if it is numeric. If false, only
            return value if it is not numeric. Leave as None to ignore this
            feature.
        ignored : list, optional
            Optionally provide a list of strings that should be ignored. If
            the value of the block data is in this list, the value will
            become an empty string instead
        mode : str, optional
            Options are combinations of top/center/bottom and
            right/center/left, e.g., 'top-right', 'center-right'. 'center'
            is assumed to be 'center-center'. Default is 'bottom-left'.

        Returns
        ----------
        list of str
            All text data that meet the input constraints
        """
        tol = tol if isinstance(tol, tuple) else (tol, tol)

        block_data = []
        for i, data in enumerate(self.data["text"]):
            data_pos = bbox_to_pos(self.data["bbox"][i], mode)
            valid_x = is_in_tol(data_pos[0], pos[0], tol[0])
            valid_y = is_in_tol(data_pos[1], pos[1], tol[1])
            if not (valid_x and valid_y):
                continue

            if text_cleaner is None:
                data_clean = data.strip()
            else:
                data_clean = text_cleaner(data)

            if ignored is not None and data_clean in ignored:
                data_clean = ""

            if data_clean and numeric is not None:
                # Drop the value when its numeric-ness does not match the
                # requested one
                if bool(numeric) != bool(is_numeric(data_clean)):
                    data_clean = ""

            if data_clean:
                block_data.append(data_clean)

        return block_data