Source code for IQDMPDF.parsers.generic

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# generic.py
"""Generic IMRT QA report parser"""
#
# Copyright (c) 2021 Dan Cutright
# This file is part of IQDM-PDF, released under a MIT license.
#    See the file LICENSE included with this distribution

from IQDMPDF.utilities import are_all_strings_in_text
from IQDMPDF.pdf_reader import CustomPDFReader, convert_pdf_to_txt
import json


[docs]class ParserBase: """Base class for all Report Parser classes, not to be used alone""" def __init__(self): """Initialize columns and identifiers""" self.columns = [] self.identifiers = [] def __call__(self, file_path): """"Save file path and text""" self.file_path = file_path self.text = convert_pdf_to_txt(file_path).split("\n")
[docs] def is_text_data_valid(self, text): """Check that all identifiers are in text Parameters ---------- text : str Output from pdf_reader.convert_pdf_to_txt Returns ---------- bool True if and only if all identifiers are found in text """ return are_all_strings_in_text(text, self.identifiers)
@property def csv_data(self): """Get a CSV data of summary_data for all columns for csv.writer Returns ---------- list summary data as a list in order of columns. File path automatically appended to data """ return [ str(self.summary_data[c]).replace("\n", "<>") for c in self.columns ]
[docs]class GenericReport(ParserBase): """Generic IMRT QA PDF report parser based on page, x, y values""" def __init__(self, json_file_path, text_cleaner=None): """Initialization of a GenericReport class Parameters ---------- json_file_path : str File path to a JSON file describing the PDF report. It should contain these keys (type): report_type (str), identifiers (list of str), and data (list). The format of each data element should be {'column': [str], 'page': [int], 'pos': [float, float]}. Optionally, you can also supply 'tol', which is either an integer or a list of integers (i.e., [x_tol, y_tol]). Also, specifying 'numeric' with a boolean value will ensure the value is or is not numeric (and return an empty string if not met). The JSON object can also have "alternates" which contains an array of data like items that will be checked until a value for a column is found. "ignored" is another option, if a value is returned that is in this array, an empty string will be returned instead. The value of "column" is automatically added to the "ignored" array. text_cleaner : callable, optional A function called on each text element (e.g., remove leading ':') """ ParserBase.__init__(self) with open(json_file_path, "r") as f: self.json_data = json.load(f) self.report_type = self.json_data["report_type"] self.identifiers = self.json_data["identifiers"] self.columns = [el["column"] for el in self.json_data["data"]] self.LUT = { el["column"]: { key: value for key, value in el.items() if key != "column" } for el in self.json_data["data"] } self._process_ignored_from_json(self.LUT) self.text_cleaner = text_cleaner self.missing_columns = [] def _process_ignored_from_json(self, LUT): """Add column to ignored key in each json_data data item Parameters ---------- LUT : dict Processed dictionary from self.json_data["data"] or self.json_data["alternates"] where keys are values from "column" """ for column, data in LUT.items(): self._assign_ignored(column, data) @staticmethod def _assign_ignored(column, data): """Ensure data has ignored key, ensure column is in data['ignored']""" if "ignored" in data.keys(): if column not in data["ignored"]: data["ignored"].append(column) else: data["ignored"] = [column] def __call__(self, report_file_path): """Process an IMRT QA report PDF Parameters ---------- report_file_path : str File path pointing to an IMRT QA report """ super().__call__(report_file_path) self.data = CustomPDFReader(report_file_path) @property def summary_data(self): """A summary of data from the QA report Returns ---------- dict Keys will match "column" elements from the JSON file. Values are of type str """ data = { c: self.data.get_block_data( **self.LUT[c], text_cleaner=self.text_cleaner ) for c in self.columns } self._process_block_data(data) self._apply_alternates(data) return data def _process_block_data(self, data): """Get the first item in data blocks, update missing_columns Parameters ---------- data : dict Has values of the return from CustomPDFReader.get_block_data """ for key in list(data): data[key] = self._process_data_element(data[key]) if not data[key]: self.missing_columns.append(key) def _process_data_element(self, data_block): """Process data element from a data block Parameters ---------- data_block : a return from ``get_block_data`` Returns ------- str Returns the first element of a data block, unless it is equal to a column name. Returns an empty string when nothing found. """ if len(data_block) and data_block[0] not in self.columns: return data_block[0] return "" def _apply_alternates(self, data): """Check json_data["alternates"] for alternate instructions. Unlike json_data["data"], multiple instances of a column can exist in alternates. The code will check each item in alternate until a value is found for that column (if a value isn't already found). Parameters ---------- data : dict Data from CustomPDFReader.get_block_data Returns ------- dict data edited by alternates if value is an empty string """ if "alternates" in self.json_data: for alternate in self.json_data["alternates"]: key = alternate["column"] alt = {k: v for k, v in alternate.items() if k != "column"} self._assign_ignored(key, alt) alt["ignored"].extend(self.LUT[key]) if key in self.missing_columns: data[key] = self.data.get_block_data( **alt, text_cleaner=self.text_cleaner ) data[key] = self._process_data_element(data[key]) if data[key] != "": self.missing_columns.pop( self.missing_columns.index(key) )