Skip to content

gmft

Install

shell
pip3 install numpy==1.24.1  # compatible with torch
pip3 install torch torchvision
pip3 install transformers
pip3 install git+https://github.com/conjuncts/gmft_pymupdf.git
pip3 install openpyxl
pip3 install XlsxWriter
pip3 install gmft

Extract Table

The AutoTableDetector is the recommended table detection tool; it currently uses Microsoft’s Table Transformer. It produces CroppedTable objects, which can be exported as images via CroppedTable.image().

The AutoTableFormatter is the recommended table formatting tool. All TableFormatters produce FormattedTable objects, which contain the original CroppedTable and the formatted dataframe; the dataframe can be exported via FormattedTable.df().

python
import importlib
import gmft
import gmft.table_detection
import gmft.table_visualization
import gmft.table_function
import gmft.table_function_algorithm
import gmft.table_captioning
import gmft.pdf_bindings.bindings_pdfium
import gmft.pdf_bindings
import gmft.common
import pandas as pd

# Re-execute the gmft package and each of the submodules imported above.
# NOTE(review): reloading only matters in a long-lived interactive session
# where gmft's source changed after the first import — presumably residue
# from notebook development; confirm before relying on or removing it.
importlib.reload(gmft)
importlib.reload(gmft.common)
importlib.reload(gmft.table_captioning)
importlib.reload(gmft.table_detection)
importlib.reload(gmft.table_visualization)
importlib.reload(gmft.table_function)
importlib.reload(gmft.table_function_algorithm)
importlib.reload(gmft.pdf_bindings.bindings_pdfium)
importlib.reload(gmft.pdf_bindings)

from gmft.auto import CroppedTable, AutoTableDetector, AutoTableFormatter
from gmft_pymupdf import PyMuPDFDocument

def ingest_pdf(pdf_path) -> tuple[list[CroppedTable], PyMuPDFDocument]:
    """Detect every table in the PDF at ``pdf_path``.

    Args:
        pdf_path: Path to the PDF file; passed unchanged to
            ``PyMuPDFDocument``.

    Returns:
        A ``(tables, doc)`` pair: the ``CroppedTable`` objects detected on
        all pages, in the order the pages are iterated, and the opened
        ``PyMuPDFDocument``.
    """
    doc = PyMuPDFDocument(pdf_path)
    detector = AutoTableDetector()
    tables = []
    for page in doc:
        # One detector call per page; its results are appended to the flat list.
        tables.extend(detector.extract(page))
    return tables, doc

def pdf2table(pdf_path):
    """Extract every table of a PDF into an Excel workbook, one per sheet.

    Tables are written to sheets named ``Sheet1``, ``Sheet2``, ... in
    detection order, without the dataframe index.

    Args:
        pdf_path: Path to the source PDF, as a string.

    Returns:
        The path of the written workbook: ``pdf_path`` with its final
        extension replaced by ``.xlsx``.
    """
    import os  # local import so the function stays self-contained

    # Swap only the trailing extension. A plain str.replace(".pdf", ".xlsx")
    # also rewrites ".pdf" inside directory names, and leaves paths without a
    # lowercase ".pdf" (e.g. "x.PDF") unchanged, so the workbook would be
    # written over the input file.
    excel_path = os.path.splitext(pdf_path)[0] + ".xlsx"

    # The document handle is returned by ingest_pdf but not needed here.
    tables, _doc = ingest_pdf(pdf_path)
    formatter = AutoTableFormatter()

    # The context manager closes the writer even if a table fails to
    # format or write.
    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
        for idx, table in enumerate(tables, start=1):
            ft = formatter.extract(table)
            ft.df().to_excel(writer, sheet_name=f"Sheet{idx}", index=False)
    return excel_path

# Example run: writes the tables found in ./samples/sherry.pdf to
# ./samples/sherry.xlsx and returns that path.
pdf2table('./samples/sherry.pdf')

transformers

GMFT uses Microsoft's Table Transformer (TATR), which is trained on the diverse PubTables-1M dataset.

GMFT relies on transformers.

On the first run, gmft downloads Microsoft's TATR from Hugging Face, which requires ~270 MB in total and is saved to ~/.cache/huggingface/hub/models--microsoft--table-{transformer-detection, structure-recognition} and ~/.cache/huggingface/hub/models--timm--resnet18.a1_in1k.

References