"""PDF AcroForm field detection and extraction.

Used to decide whether an uploaded PDF should be treated as a fillable form
(routed to the pdf_form document type) versus a regular text PDF (routed
through document_processor._process_pdf).

Also provides helpers to write values back into form fields and to stamp
signature images and freeform annotations onto a PDF.
"""
|
||
|
||
import logging
|
||
import re
|
||
from typing import Any
|
||
|
||
# PyMuPDF is an OPTIONAL dependency (AGPL-3.0), required ONLY for the PDF
# form-filling feature implemented in this module. The MIT core imports fine
# without it; calling these functions without PyMuPDF raises a clear error.
# See requirements-optional.txt.
try:
    import fitz  # PyMuPDF — optional, AGPL-3.0
except ImportError:  # pragma: no cover
    # Sentinel: _require_fitz() checks for None and raises a RuntimeError.
    fitz = None

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Error text raised (as RuntimeError) by _require_fitz() when the optional
# PyMuPDF import above failed.
_PYMUPDF_MISSING = (
    "PDF form features require PyMuPDF, an optional dependency. Install it with "
    "`pip install -r requirements-optional.txt` (note: PyMuPDF is AGPL-3.0)."
)
|
||
|
||
|
||
def _require_fitz():
    """Return the imported PyMuPDF module, or fail loudly when it is missing.

    Raises:
        RuntimeError: with the ``_PYMUPDF_MISSING`` install hint when the
            optional ``fitz`` import did not succeed at module load.
    """
    if fitz is not None:
        return fitz
    raise RuntimeError(_PYMUPDF_MISSING)
|
||
|
||
|
||
def _widget_type_names() -> dict:
    """Map PyMuPDF widget-type constants to the short names used in field dicts.

    Built on demand (not at import time) because ``fitz`` is optional and may
    be ``None`` when the module is loaded.
    """
    type_ids = (
        fitz.PDF_WIDGET_TYPE_UNKNOWN,
        fitz.PDF_WIDGET_TYPE_BUTTON,
        fitz.PDF_WIDGET_TYPE_CHECKBOX,
        fitz.PDF_WIDGET_TYPE_RADIOBUTTON,
        fitz.PDF_WIDGET_TYPE_TEXT,
        fitz.PDF_WIDGET_TYPE_LISTBOX,
        fitz.PDF_WIDGET_TYPE_COMBOBOX,
        fitz.PDF_WIDGET_TYPE_SIGNATURE,
    )
    labels = (
        "unknown",
        "button",
        "checkbox",
        "radio",
        "text",
        "listbox",
        "combobox",
        "signature",
    )
    return dict(zip(type_ids, labels))
|
||
|
||
# Text widgets that are really signature placeholders. Covers Adobe Sign-style
# "_es_:signature" text tags and the bare "signed N" / "Signature" patterns
# common in UK conveyancing forms (TA6, TA10). Uses substring match
# deliberately (the pattern is applied with .search(), case-insensitively) —
# false positives like "assigned" are rare in form-field names.
_SIGNATURE_NAME_RE = re.compile(r'sign(?:ed|ature)', re.IGNORECASE)
|
||
|
||
|
||
def has_form_fields(path: str, *, min_widgets: int = 3) -> bool:
    """Return True if the PDF looks like a *fillable form* — not just a
    content PDF that happens to carry a stray widget.

    Excel-exported PDFs (Japanese estimates, invoices, etc.) often ship with
    one or two orphan AcroForm widgets (a signature stamp box, a leftover
    text field from the source template) even when they're really
    content-only documents. Treating those as forms routes them through the
    form-fill chat prompt that ASKS the user which field to edit instead of
    discussing the content — which is exactly the bug we're trying to avoid.

    Heuristic: require at least ``min_widgets`` non-signature widgets.
    Signature-only PDFs (e.g. a contract with one sign-here box) read as
    content, and tiny stray-widget counts no longer hijack the chat. Genuine
    UK conveyancing forms (TA6, TA10) and similar carry dozens of widgets and
    still trip this threshold easily.

    A widget counts as a signature when it is a true signature widget OR a
    text widget whose name matches ``_SIGNATURE_NAME_RE`` — the same
    reclassification ``extract_fields`` applies, so both functions agree on
    what a signature placeholder is.

    Args:
        path: Filesystem path of the PDF to inspect.
        min_widgets: Minimum number of non-signature widgets required for the
            PDF to be treated as a form. Defaults to 3.

    Returns:
        True when the threshold is reached; False otherwise, including when
        the file cannot be opened (a warning is logged in that case).

    Raises:
        RuntimeError: if the optional PyMuPDF dependency is not installed.
    """
    _require_fitz()
    try:
        doc = fitz.open(path)
    except Exception as e:
        logger.warning("Could not open PDF %s for form detection: %s", path, e)
        return False
    try:
        counted = 0
        for page in doc:
            for w in page.widgets() or []:
                if w.field_type == fitz.PDF_WIDGET_TYPE_SIGNATURE:
                    continue
                # E-sign placeholders are stored as plain text widgets; treat
                # them as signatures here exactly as extract_fields does.
                if (
                    w.field_type == fitz.PDF_WIDGET_TYPE_TEXT
                    and _SIGNATURE_NAME_RE.search(w.field_name or "")
                ):
                    continue
                counted += 1
                if counted >= min_widgets:
                    # Early exit: no need to scan the rest of the document.
                    return True
        # Only reachable below the threshold, or when min_widgets <= 0.
        return counted >= min_widgets
    finally:
        doc.close()
|
||
|
||
|
||
def _infer_label(page: "fitz.Page", rect: "fitz.Rect", page_words: list) -> str:
|
||
"""Best-effort label inference from text near a widget.
|
||
|
||
Strategy: prefer text immediately to the left on the same line,
|
||
then text immediately above. Returns the closest non-empty match
|
||
or "" if nothing useful is found. AcroForm field_label is rarely
|
||
populated in real-world forms, so this fallback matters.
|
||
"""
|
||
candidates_left = []
|
||
candidates_above = []
|
||
line_tol = max(2.0, rect.height * 0.6)
|
||
|
||
for w in page_words:
|
||
wx0, wy0, wx1, wy1, text = w[0], w[1], w[2], w[3], w[4]
|
||
if not text.strip():
|
||
continue
|
||
# Same line, to the left
|
||
if abs((wy0 + wy1) / 2 - (rect.y0 + rect.y1) / 2) < line_tol and wx1 <= rect.x0 + 1:
|
||
candidates_left.append((rect.x0 - wx1, wx0, text))
|
||
# Above, horizontally overlapping
|
||
elif wy1 <= rect.y0 + 1 and not (wx1 < rect.x0 or wx0 > rect.x1):
|
||
candidates_above.append((rect.y0 - wy1, wx0, text))
|
||
|
||
def _join_nearest(cands, gap_limit):
|
||
if not cands:
|
||
return ""
|
||
cands.sort(key=lambda c: (c[0], c[1]))
|
||
nearest_dist = cands[0][0]
|
||
if nearest_dist > gap_limit:
|
||
return ""
|
||
same = [c for c in cands if c[0] - nearest_dist < line_tol]
|
||
same.sort(key=lambda c: c[1])
|
||
return " ".join(c[2] for c in same).strip()
|
||
|
||
label = _join_nearest(candidates_left, gap_limit=200.0)
|
||
if label:
|
||
return label
|
||
return _join_nearest(candidates_above, gap_limit=40.0)
|
||
|
||
|
||
def _widget_on_state(w) -> str:
|
||
try:
|
||
return w.on_state() or ""
|
||
except Exception:
|
||
return ""
|
||
|
||
|
||
def extract_fields(path: str) -> list[dict[str, Any]]:
    """Enumerate form fields, one entry per unique field name.

    Multiple checkbox widgets sharing a field name are treated as a single
    "choice" field whose options are each widget's on-state — that's the
    PDF idiom for radio-style "Included / Excluded / None" rows.

    Widgets without a field name are skipped. Text widgets whose name matches
    ``_SIGNATURE_NAME_RE`` are reported with type "signature".

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        Dicts, in first-seen order, with: name, type, label, value, options,
        page (1-indexed), rect (x0,y0,x1,y1) for the first widget in the
        group, required. An empty list if the PDF cannot be opened (the
        failure is logged at error level).

    Raises:
        RuntimeError: if the optional PyMuPDF dependency is not installed.
    """
    _require_fitz()
    names = _widget_type_names()
    # Plain dicts keep insertion order, so this also records first-seen order.
    grouped: dict[str, dict[str, Any]] = {}
    try:
        doc = fitz.open(path)
    except Exception as e:
        logger.error("Could not open PDF %s for field extraction: %s", path, e)
        return []

    try:
        for page_index, page in enumerate(doc):
            # Page.widgets() yields lazily and a generator object is always
            # truthy, so materialize it: otherwise the emptiness check below
            # never fires and every widget-free page pays for word extraction.
            widgets = list(page.widgets() or [])
            if not widgets:
                continue
            words = page.get_text("words")
            for w in widgets:
                name = w.field_name or ""
                if not name:
                    continue
                wtype = names.get(w.field_type, "unknown")
                raw_value = w.field_value
                value = raw_value if raw_value is not None else ""
                on_state = _widget_on_state(w) if wtype == "checkbox" else ""
                g = grouped.get(name)

                # A label is only ever used for a new group or for a group
                # that still lacks one, so skip the inference otherwise.
                label = ""
                if g is None or not g["label"]:
                    label = (getattr(w, "field_label", None) or "").strip()
                    if not label:
                        label = _infer_label(page, w.rect, words)

                if g is None:
                    # AdobeSign-style signature placeholders are stored as
                    # plain text widgets but named with `_es_:signature`.
                    if wtype == "text" and _SIGNATURE_NAME_RE.search(name):
                        wtype = "signature"
                    choices = w.choice_values
                    if choices:
                        options = list(choices)
                    else:
                        options = [on_state] if on_state else []
                    grouped[name] = {
                        "name": name,
                        "type": wtype,
                        "label": label,
                        "value": value,
                        "options": options,
                        "page": page_index + 1,
                        "rect": [w.rect.x0, w.rect.y0, w.rect.x1, w.rect.y1],
                        # Bit value 2 of the field flags marks "required".
                        "required": bool((w.field_flags or 0) & 2),
                        # Private bookkeeping; stripped before returning.
                        "_on_states": [on_state] if on_state else [],
                    }
                    continue

                # Later widget of an already-seen field: fill in the gaps.
                if label and not g["label"]:
                    g["label"] = label
                if value and not g["value"]:
                    g["value"] = value
                if on_state and on_state not in g["_on_states"]:
                    g["_on_states"].append(on_state)
                    if on_state not in g["options"]:
                        g["options"].append(on_state)
                # If a checkbox name appears more than once with different
                # on-states, promote it to a choice field.
                if wtype == "checkbox" and len(g["_on_states"]) > 1:
                    g["type"] = "choice"
    finally:
        doc.close()

    out = []
    for g in grouped.values():
        g.pop("_on_states", None)
        out.append(g)
    return out
|
||
|
||
|
||
def stamp_signatures(
    pdf_path: str,
    output_path: str,
    stamps: dict[str, bytes],
) -> int:
    """Draw PNG signature images over the widgets of the named form fields.

    ``stamps`` maps field name -> PNG bytes. Every widget whose field name is
    a key with non-empty bytes gets the image inserted into its rectangle,
    aspect ratio preserved, as an overlay. The widgets themselves are not
    removed or altered.

    The document is saved to ``output_path`` whenever ``stamps`` is non-empty,
    even if no widget matched. With empty ``stamps`` nothing is opened or
    written and 0 is returned. A failed image insertion is logged as a
    warning and skipped.

    Returns:
        The number of images successfully inserted.
    """
    if not stamps:
        return 0
    _require_fitz()
    doc = fitz.open(pdf_path)
    stamped = 0
    try:
        for page in doc:
            for widget in page.widgets() or []:
                name = widget.field_name
                # Missing key and empty payload are both "nothing to stamp".
                png = stamps.get(name)
                if not png:
                    continue
                try:
                    page.insert_image(
                        widget.rect,
                        stream=png,
                        keep_proportion=True,
                        overlay=True,
                    )
                except Exception as e:
                    logger.warning(f"Failed to stamp signature into {name}: {e}")
                else:
                    stamped += 1
        doc.save(output_path, incremental=False, deflate=True)
    finally:
        doc.close()
    return stamped
|
||
|
||
|
||
def _draw_text_annotation(page, x: float, y: float, value, line_height) -> bool:
    """Draw a (possibly multi-line) text annotation; True if any line was drawn."""
    if not value:
        return False
    line_height = float(line_height or 1.3)
    lines = value.split("\n")
    # Fixed point size — keeps text consistent across boxes
    # regardless of how each was resized. Per HTML metrics the
    # baseline of a line box sits at fontsize × (lh + 0.6) / 2
    # from the line-box top (half the leading above the glyph,
    # half below, ascent ≈ 0.8 × fontsize).
    fontsize = 11.0
    # Stride between lines is tuned to match what the editor
    # shows: the editor's textarea renders text larger than
    # 11pt (cqh-based ≈ 1.5% of page-image height ≈ 17pt for
    # Letter), so its rows are spaced wider than 11 × lh on
    # the page. Multiply the export stride to compensate.
    line_box = fontsize * line_height * 1.2
    # First baseline at one ascent below the box top — closest
    # match to where the editor's first line of text appears.
    yy = y + fontsize * 0.85
    # Match the textarea's 4px left padding (~3 PDF points).
    xx = x + 3.0
    drawn = False
    for line in lines:
        try:
            page.insert_text(
                (xx, yy),
                line,
                fontsize=fontsize,
                color=(0, 0, 0),
            )
            drawn = True
        except Exception as e:
            logger.warning("insert_text failed for annotation: %s", e)
        # Advance even after a failure so later lines keep their position.
        yy += line_box
    return drawn


def _draw_check_annotation(page, x: float, y: float, w: float, h: float) -> None:
    """Draw a checkmark stroke centred in the box (x, y, w, h)."""
    cx = x + w / 2.0
    cy = y + h / 2.0
    size = min(w, h) * 0.85
    p1 = fitz.Point(cx - size * 0.40, cy + size * 0.05)
    p2 = fitz.Point(cx - size * 0.10, cy + size * 0.30)
    p3 = fitz.Point(cx + size * 0.45, cy - size * 0.30)
    shape = page.new_shape()
    shape.draw_polyline([p1, p2, p3])
    shape.finish(
        color=(0, 0, 0),
        width=max(1.0, size * 0.13),
        lineCap=1,
        lineJoin=1,
    )
    shape.commit()


def _draw_signature_annotation(page, rect, value, signature_pngs: dict) -> bool:
    """Insert the PNG referenced by a "signature:<id>" value; True on success."""
    if not isinstance(value, str) or not value.startswith("signature:"):
        return False
    sid = value[len("signature:"):].strip()
    png = signature_pngs.get(sid)
    if not png:
        return False
    try:
        page.insert_image(rect, stream=png, keep_proportion=True, overlay=True)
    except Exception as e:
        logger.warning("signature stamp failed: %s", e)
        return False
    return True


def _stamp_one_annotation(doc, ann: dict, signature_pngs: dict) -> bool:
    """Resolve one annotation's page and geometry and draw it; True if drawn."""
    page_no = int(ann.get("page") or 1)
    if page_no < 1 or page_no > doc.page_count:
        return False
    page = doc[page_no - 1]
    # Coordinates arrive as percentages of the page size.
    pw, ph = page.rect.width, page.rect.height
    x = float(ann.get("x", 0)) / 100.0 * pw
    y = float(ann.get("y", 0)) / 100.0 * ph
    w = float(ann.get("w", 0)) / 100.0 * pw
    h = float(ann.get("h", 0)) / 100.0 * ph
    kind = ann.get("kind", "text")
    value = ann.get("value", "")

    if kind == "text":
        return _draw_text_annotation(page, x, y, value, ann.get("line_height"))
    if kind == "check":
        _draw_check_annotation(page, x, y, w, h)
        return True
    if kind == "signature":
        rect = fitz.Rect(x, y, x + w, y + h)
        return _draw_signature_annotation(page, rect, value, signature_pngs)
    # Unknown kinds are ignored.
    return False


def stamp_annotations(
    pdf_path: str,
    output_path: str,
    annotations: list[dict],
    signature_pngs: dict[str, bytes] | None = None,
) -> int:
    """Burn freeform annotations (text, check, signature) onto a PDF.

    Each annotation has page-percentage coords (x, y, w, h: 0–100), a `kind`
    in {text, check, signature}, a string value, and a line_height for text.
    Signature values have the form "signature:<id>", where <id> is looked up
    in ``signature_pngs``.

    Annotations that reference a page outside the document, have an unknown
    kind, an empty text value, or an unresolvable signature are skipped. A
    failure while processing one annotation is logged as a warning and does
    not stop the others. The document is saved to ``output_path`` whenever
    ``annotations`` is non-empty.

    Args:
        pdf_path: Source PDF to read.
        output_path: Destination path for the stamped PDF.
        annotations: Annotation dicts as described above.
        signature_pngs: Mapping of signature id -> PNG bytes; optional.

    Returns:
        The number of annotations actually stamped. A text annotation counts
        only if at least one of its lines was drawn.

    Raises:
        RuntimeError: if the optional PyMuPDF dependency is not installed.
    """
    if not annotations:
        return 0
    _require_fitz()
    signature_pngs = signature_pngs or {}
    doc = fitz.open(pdf_path)
    written = 0
    try:
        for ann in annotations:
            try:
                if _stamp_one_annotation(doc, ann, signature_pngs):
                    written += 1
            except Exception as e:
                # The entry may not even be a dict; never let the log call
                # itself raise and abort the remaining annotations.
                ann_id = ann.get("id") if isinstance(ann, dict) else None
                logger.warning("Failed to stamp annotation %s: %s", ann_id, e)
        doc.save(output_path, incremental=False, deflate=True)
    finally:
        doc.close()
    return written
|
||
|
||
|
||
def fill_fields(source_path: str, output_path: str, values: dict[str, Any]) -> int:
    """Apply ``values`` to the PDF's AcroForm widgets and save a new file.

    ``values`` maps field name -> new value. Widgets whose field name is not
    a key are left untouched. For checkbox widgets a ``bool`` switches the
    box on (its own on-state, or "Yes" if it reports none) or "Off"; any
    other value is treated as a choice among same-named checkboxes — the
    widget whose on-state equals the stripped string is switched on and the
    rest go "Off". All other widget types receive ``str(value)``, with
    ``None`` written as "".

    Returns:
        The number of widgets updated (a field backed by several widgets
        contributes one per widget).
    """
    _require_fitz()
    doc = fitz.open(source_path)
    touched = 0
    try:
        for page in doc:
            for widget in page.widgets() or []:
                field = widget.field_name
                if field not in values:
                    continue
                wanted = values[field]

                if widget.field_type != fitz.PDF_WIDGET_TYPE_CHECKBOX:
                    widget.field_value = "" if wanted is None else str(wanted)
                else:
                    state = _widget_on_state(widget)
                    if isinstance(wanted, bool):
                        # Single checkbox: bool semantics
                        if wanted:
                            widget.field_value = state or "Yes"
                        else:
                            widget.field_value = "Off"
                    else:
                        # Choice/radio group: only the widget whose on-state
                        # matches gets that on-state; the rest go Off.
                        chosen = "" if wanted is None else str(wanted).strip()
                        if state and state == chosen:
                            widget.field_value = state
                        else:
                            widget.field_value = "Off"

                widget.update()
                touched += 1
        doc.save(output_path, incremental=False, deflate=True)
    finally:
        doc.close()
    return touched
|