Appearance
gmft
Install
shell
# Install numpy at a pinned version first, before torch.
pip3 install numpy==1.24.1 # pinned: compatible with torch
pip3 install torch torchvision
pip3 install transformers
# Installed straight from GitHub; the Python example imports it as `gmft_pymupdf`.
pip3 install git+https://github.com/conjuncts/gmft_pymupdf.git
pip3 install openpyxl
# XlsxWriter is the engine the Python example passes to pd.ExcelWriter.
pip3 install XlsxWriter
pip3 install gmft
# Install numpy at a pinned version first, before torch.
pip3 install numpy==1.24.1 # pinned: compatible with torch
pip3 install torch torchvision
pip3 install transformers
# Installed straight from GitHub; the Python example imports it as `gmft_pymupdf`.
pip3 install git+https://github.com/conjuncts/gmft_pymupdf.git
pip3 install openpyxl
# XlsxWriter is the engine the Python example passes to pd.ExcelWriter.
pip3 install XlsxWriter
pip3 install gmft
Extract Table
The `AutoTableDetector` is the recommended table detection tool; it currently uses Microsoft’s Table Transformer. It produces `CroppedTable` objects, from which `CroppedTable.image()` permits image export.
The AutoTableFormatter
is the recommended table formatting tool. All TableFormatters produce FormattedTable
objects, which contain the original CroppedTable and the formatted dataframe, and from which FormattedTable.df()
permits dataframe export.
python
import importlib
import gmft
import gmft.table_detection
import gmft.table_visualization
import gmft.table_function
import gmft.table_function_algorithm
import gmft.table_captioning
import gmft.pdf_bindings.bindings_pdfium
import gmft.pdf_bindings
import gmft.common
import pandas as pd
# Re-execute the already-imported gmft modules so the names imported below come
# from freshly loaded code. NOTE(review): presumably only useful in a long-lived
# interactive session (e.g. a notebook) after gmft was changed on disk -- confirm.
importlib.reload(gmft)
importlib.reload(gmft.common)
importlib.reload(gmft.table_captioning)
importlib.reload(gmft.table_detection)
importlib.reload(gmft.table_visualization)
importlib.reload(gmft.table_function)
importlib.reload(gmft.table_function_algorithm)
importlib.reload(gmft.pdf_bindings.bindings_pdfium)
importlib.reload(gmft.pdf_bindings)
from gmft.auto import CroppedTable, AutoTableDetector, AutoTableFormatter
from gmft_pymupdf import PyMuPDFDocument
def ingest_pdf(pdf_path) -> tuple[list[CroppedTable], PyMuPDFDocument]:
    """Detect every table in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed as-is to ``PyMuPDFDocument``.

    Returns:
        A ``(tables, doc)`` pair: the tables detected on all pages, in page
        order, and the opened document. The document is handed back to the
        caller rather than dropped here -- presumably because the tables
        still read from it; TODO confirm.
    """
    doc = PyMuPDFDocument(pdf_path)
    detector = AutoTableDetector()
    tables = []
    for page in doc:
        # One page may yield several tables; keep them in a single flat list.
        tables.extend(detector.extract(page))
    return tables, doc
def pdf2table(pdf_path):
    """Export every table found in a PDF to an Excel workbook.

    Each detected table is formatted into a dataframe and written to its own
    sheet, named ``Sheet1``, ``Sheet2``, ... in detection order.

    Args:
        pdf_path: Path of the PDF file, as a string.

    Returns:
        Path of the workbook that was written: ``pdf_path`` with a trailing
        ``.pdf`` (any letter case) replaced by ``.xlsx``, or with ``.xlsx``
        appended when the path has no ``.pdf`` suffix.
    """
    # Swap only a trailing ".pdf". A blanket str.replace would also rewrite
    # ".pdf" inside directory names and, when nothing matched (e.g. ".PDF"),
    # leave the output path equal to the input so the workbook would
    # overwrite the source PDF.
    pdf_suffix = ".pdf"
    if pdf_path.lower().endswith(pdf_suffix):
        excel_path = pdf_path[: -len(pdf_suffix)] + ".xlsx"
    else:
        excel_path = pdf_path + ".xlsx"

    # `doc` is kept bound while the tables are formatted -- presumably the
    # tables still read from the open document; TODO confirm.
    tables, doc = ingest_pdf(pdf_path)
    formatter = AutoTableFormatter()

    # The context manager closes (and thereby saves) the workbook even when
    # formatting or writing one of the tables raises.
    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
        for number, table in enumerate(tables, start=1):
            formatted = formatter.extract(table)
            formatted.df().to_excel(writer, sheet_name=f"Sheet{number}", index=False)
    return excel_path
# Example run: convert the sample PDF; the workbook is written to ./samples/sherry.xlsx.
pdf2table('./samples/sherry.pdf')
import importlib
import gmft
import gmft.table_detection
import gmft.table_visualization
import gmft.table_function
import gmft.table_function_algorithm
import gmft.table_captioning
import gmft.pdf_bindings.bindings_pdfium
import gmft.pdf_bindings
import gmft.common
import pandas as pd
# Re-execute the already-imported gmft modules so the names imported below come
# from freshly loaded code. NOTE(review): presumably only useful in a long-lived
# interactive session (e.g. a notebook) after gmft was changed on disk -- confirm.
importlib.reload(gmft)
importlib.reload(gmft.common)
importlib.reload(gmft.table_captioning)
importlib.reload(gmft.table_detection)
importlib.reload(gmft.table_visualization)
importlib.reload(gmft.table_function)
importlib.reload(gmft.table_function_algorithm)
importlib.reload(gmft.pdf_bindings.bindings_pdfium)
importlib.reload(gmft.pdf_bindings)
from gmft.auto import CroppedTable, AutoTableDetector, AutoTableFormatter
from gmft_pymupdf import PyMuPDFDocument
def ingest_pdf(pdf_path) -> tuple[list[CroppedTable], PyMuPDFDocument]:
    """Detect every table in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed as-is to ``PyMuPDFDocument``.

    Returns:
        A ``(tables, doc)`` pair: the tables detected on all pages, in page
        order, and the opened document. The document is handed back to the
        caller rather than dropped here -- presumably because the tables
        still read from it; TODO confirm.
    """
    doc = PyMuPDFDocument(pdf_path)
    detector = AutoTableDetector()
    tables = []
    for page in doc:
        # One page may yield several tables; keep them in a single flat list.
        tables.extend(detector.extract(page))
    return tables, doc
def pdf2table(pdf_path):
    """Export every table found in a PDF to an Excel workbook.

    Each detected table is formatted into a dataframe and written to its own
    sheet, named ``Sheet1``, ``Sheet2``, ... in detection order.

    Args:
        pdf_path: Path of the PDF file, as a string.

    Returns:
        Path of the workbook that was written: ``pdf_path`` with a trailing
        ``.pdf`` (any letter case) replaced by ``.xlsx``, or with ``.xlsx``
        appended when the path has no ``.pdf`` suffix.
    """
    # Swap only a trailing ".pdf". A blanket str.replace would also rewrite
    # ".pdf" inside directory names and, when nothing matched (e.g. ".PDF"),
    # leave the output path equal to the input so the workbook would
    # overwrite the source PDF.
    pdf_suffix = ".pdf"
    if pdf_path.lower().endswith(pdf_suffix):
        excel_path = pdf_path[: -len(pdf_suffix)] + ".xlsx"
    else:
        excel_path = pdf_path + ".xlsx"

    # `doc` is kept bound while the tables are formatted -- presumably the
    # tables still read from the open document; TODO confirm.
    tables, doc = ingest_pdf(pdf_path)
    formatter = AutoTableFormatter()

    # The context manager closes (and thereby saves) the workbook even when
    # formatting or writing one of the tables raises.
    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
        for number, table in enumerate(tables, start=1):
            formatted = formatter.extract(table)
            formatted.df().to_excel(writer, sheet_name=f"Sheet{number}", index=False)
    return excel_path
# Example run: convert the sample PDF; the workbook is written to ./samples/sherry.xlsx.
pdf2table('./samples/sherry.pdf')
transformers
GMFT uses Microsoft's Table Transformer (TATR), which is trained on PubTables-1M, a diverse dataset.
GMFT relies on transformers.
On the first run, gmft downloads Microsoft's TATR from Hugging Face. This requires ~270 MB in total; the files are saved to `~/.cache/huggingface/hub/models--microsoft--table-{transformer-detection, structure-recognition}` and `~/.cache/huggingface/hub/models--timm--resnet18.a1_in1k`.