Source code for IQDMPDF.file_processor

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# file_processor.py
"""Process IMRT QA file(s) into CSV file(s)"""
#
# Copyright (c) 2021 Dan Cutright
# This file is part of IQDM-PDF, released under a MIT license.
#    See the file LICENSE included with this distribution

from datetime import datetime
from os.path import isfile, join
import csv
from IQDMPDF.parsers.parser import ReportParser
from IQDMPDF.utilities import get_files, run_multiprocessing
from IQDMPDF._version import __version__


[docs]def process_files(
    init_directory,
    ignore_extension=False,
    output_file=None,
    output_dir=None,
    no_recursive_search=False,
    callback=None,
    raise_errors=False,
    processes=1,
):
    """Process all pdf files into parser classes, write data to csv

    Parameters
    ----------
    init_directory : str
        initial scanning directory
    ignore_extension : bool, optional
        Set to True to catch pdf files that are missing .pdf extension
    output_file : str, optional
       Report type in file name will be prepended to this value
    output_dir : str, optional
        Save results to this directory, default is local directory
    no_recursive_search : bool, optional
        Ignore sub-directories it True
    callback : callable
        Pointer to a function to be called before each process_file call. The
        parameter will be dict with keys of "label" and "gauge".
    raise_errors : bool
        Set to True to allow errors to be raised (useful for debugging)
    processes : int
        Number of parallel processes allowed
    """

    time_stamp = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    if output_file is None:
        output_file = "results_%s.csv" % time_stamp

    extension = None if ignore_extension else ".pdf"
    search_sub_dir = not no_recursive_search
    files = get_files(init_directory, search_sub_dir, extension)

    if processes == 1:
        for i, file in enumerate(files):
            if callback is not None:
                label = "Processing (%s of %s): %s" % (i + 1, len(files), file)
                gauge = float(i) / float(len(files))
                callback({"label": label, "gauge": gauge})
            try:
                process_file(file, output_file, output_dir)
            except Exception as e:
                if raise_errors:
                    raise e
                else:
                    print(str(e))
    else:
        # Multiprocessing
        print("Processing %s file(s) ..." % len(files))
        all_data = run_multiprocessing(
            process_file_worker, files, processes, callback=callback
        )

        print("Writing results to file(s) ...")

        # remove failed parsing
        all_data = [row for row in all_data if row["report_type"] is not None]

        report_types = list(set([row["report_type"] for row in all_data]))

        sorted_data = {key: [] for key in report_types}
        columns = {}
        for row in all_data:
            report_type = row["report_type"]
            sorted_data[report_type].append(row["data"])
            if report_type not in columns.keys():
                columns[report_type] = row["columns"]

        for report_type, data in sorted_data.items():
            current_file = "%s_%s" % (report_type, output_file)
            if output_dir is not None:
                current_file = join(output_dir, current_file)

            output = [columns[report_type]]
            output.extend(data)
            write_csv(current_file, output)

            print("%s data written to %s" % (report_type, current_file))


[docs]def process_file_worker(file_path):
    """Mutliprocessing worker function

    Parameters
    ----------
    file_path : str
        PDF file to be passed to ReportParser

    Returns
    -------
    dict
        {"data": ReportParser.csv_data, "report_type": ReportParser.report_type,
        "columns": ReportParser.columns}
    """
    data, report_type, columns = None, None, None
    try:
        parser = ReportParser(file_path)
        if parser.report is not None:
            data = parser.csv_data
            report_type = parser.report_type
            columns = parser.columns
    except Exception:
        pass
    return {"data": data, "report_type": report_type, "columns": columns}


[docs]def process_file(file_path, output_file, output_dir=None):
    """Process a pdf file into a parser class, write data to csv

    Parameters
    ----------
    file_path : str
        PDF file to processed
    output_file : str
       Report type in file name will be prepended to this value
    output_dir : str, optional
        Save results to this directory, default is local directory
    """
    parser = ReportParser(file_path)
    if parser.report is not None:
        row = parser.csv_data
        current_file = "%s_%s" % (
            parser.report_type,
            output_file,
        )  # prepend report type to file name
        if output_dir is not None:
            current_file = join(output_dir, current_file)
        if row:
            # if file doesn't exist, need to write columns
            if not isfile(current_file):
                write_csv(current_file, [parser.columns])
            # write the processed data
            write_csv(current_file, [row], mode="a")
    else:
        print("Skipping: %s" % file_path)


[docs]def write_csv(file_path, rows, mode="w", newline=""):
    """Create csv.writer, call writerows(rows)

    Parameters
    ----------
    file_path : str
        path to file
    rows : list, iterable
        Items to be written to file_pointer (input for csv.writer.writerows)
    mode : str
        optional string that specifies the mode in which the file is opened
    newline : str
        controls how universal newlines mode works.
        It can be None, '', '\n', '\r', and '\r\n'
    """
    with open(file_path, mode, encoding="utf-8", newline=newline) as f:
        writer = csv.writer(
            f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        writer.writerows(rows)


[docs]def validate_kwargs(kwargs, add_print_callback=True):
    """Process kwargs from main for process_files

    Parameters
    ----------
    kwargs : dict
        Keyword arguments for main. See main.create_arg_parser for
        valid arguments
    add_print_callback : bool
        If true, add simple print function at the start of each process_file
        call

    Returns
    ----------
    dict
        Returns a dict containing only keywords applicable to process_files, or
        an empty dict if "init_directory" is missing or "print_version" is
        True and "init_directory" is missing
    """
    if not kwargs["init_directory"] or len(kwargs) < 2:
        if kwargs["print_version"]:
            print("IMRT-QA-Data-Miner: IQDM-PDF v%s" % __version__)
        else:
            print("Initial directory not provided!")
        return {}

    if add_print_callback:
        kwargs["callback"] = print_callback

    # command line args are strings
    try:
        kwargs["processes"] = int(float(kwargs["processes"]))
    except Exception:
        kwargs["processes"] = 1

    keys = [
        "init_directory",
        "ignore_extension",
        "output_file",
        "output_dir",
        "no_recursive_search",
        "raise_errors",
        "callback",
        "processes",
    ]
    return {key: kwargs[key] for key in keys if key in list(kwargs)}


[docs]def print_callback(msg):
    """Simple print callback for process_files

    Parameters
    ----------
    msg : dict
        The message sent from process_files
    """
    print(msg["label"])