Odysseus v1.0
This commit is contained in:
401
src/pdf_forms.py
Normal file
401
src/pdf_forms.py
Normal file
@@ -0,0 +1,401 @@
|
||||
"""PDF AcroForm field detection and extraction.
|
||||
|
||||
Used to decide whether an uploaded PDF should be treated as a fillable form
|
||||
(routed to the pdf_form document type) versus a regular text PDF (routed
|
||||
through document_processor._process_pdf).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
# PyMuPDF is an OPTIONAL dependency (AGPL-3.0), required ONLY for the PDF
|
||||
# form-filling feature implemented in this module. The MIT core imports fine
|
||||
# without it; calling these functions without PyMuPDF raises a clear error.
|
||||
# See requirements-optional.txt.
|
||||
try:
|
||||
import fitz # PyMuPDF — optional, AGPL-3.0
|
||||
except ImportError: # pragma: no cover
|
||||
fitz = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_PYMUPDF_MISSING = (
|
||||
"PDF form features require PyMuPDF, an optional dependency. Install it with "
|
||||
"`pip install -r requirements-optional.txt` (note: PyMuPDF is AGPL-3.0)."
|
||||
)
|
||||
|
||||
|
||||
def _require_fitz():
    """Return the imported PyMuPDF module, or fail loudly when it is missing.

    Raises:
        RuntimeError: with the ``_PYMUPDF_MISSING`` install hint when the
            optional ``fitz`` import at module load did not succeed.
    """
    if fitz is not None:
        return fitz
    raise RuntimeError(_PYMUPDF_MISSING)
|
||||
|
||||
|
||||
def _widget_type_names() -> dict:
    """Map PyMuPDF ``PDF_WIDGET_TYPE_*`` constants to short lowercase names.

    The mapping is built on each call rather than at import time because
    ``fitz`` may be ``None`` when the optional dependency is not installed.
    """
    # (constant suffix, short name) pairs, in the same order as the constants.
    suffix_to_label = (
        ("UNKNOWN", "unknown"),
        ("BUTTON", "button"),
        ("CHECKBOX", "checkbox"),
        ("RADIOBUTTON", "radio"),
        ("TEXT", "text"),
        ("LISTBOX", "listbox"),
        ("COMBOBOX", "combobox"),
        ("SIGNATURE", "signature"),
    )
    return {
        getattr(fitz, f"PDF_WIDGET_TYPE_{suffix}"): label
        for suffix, label in suffix_to_label
    }
|
||||
|
||||
# Text widgets that are really signature placeholders. Covers Adobe Sign-style
|
||||
# "_es_:signature" and the bare "signed N" / "Signature" patterns common in
|
||||
# UK conveyancing forms (TA6, TA10). Uses substring match deliberately —
|
||||
# false positives like "assigned" are rare in form-field names.
|
||||
_SIGNATURE_NAME_RE = re.compile(r'sign(?:ed|ature)', re.IGNORECASE)
|
||||
|
||||
|
||||
def has_form_fields(path: str) -> bool:
    """Return True if the PDF looks like a *fillable form* — not just a
    content PDF that happens to carry a stray widget.

    Excel-exported PDFs (Japanese estimates, invoices, etc.) often ship with
    one or two orphan AcroForm widgets (a signature stamp box, a leftover
    text field from the source template) even when they're really
    content-only documents. Treating those as forms routes them through the
    form-fill chat prompt that ASKS the user which field to edit instead of
    discussing the content — which is exactly the bug we're trying to avoid.

    Heuristic: require at least 3 non-signature widgets. Signature-only
    PDFs (e.g. a contract with one sign-here box) read as content, and tiny
    stray-widget counts no longer hijack the chat. Genuine UK conveyancing
    forms (TA6, TA10) and similar carry dozens of widgets and still trip
    this threshold easily.

    Detection is best-effort: a PDF that cannot be opened *or* cannot be
    inspected (e.g. page access fails on a password-protected or damaged
    file) is logged and reported as "not a form" rather than raising.

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        True when at least three widgets whose type is not "signature" are
        found; False otherwise, including when the file cannot be read.

    Raises:
        RuntimeError: if the optional PyMuPDF dependency is not installed.
    """
    _require_fitz()
    try:
        doc = fitz.open(path)
    except Exception as e:
        logger.warning(f"Could not open PDF {path} for form detection: {e}")
        return False
    try:
        non_signature_count = 0
        for page in doc:
            for w in page.widgets() or []:
                if w.field_type != fitz.PDF_WIDGET_TYPE_SIGNATURE:
                    non_signature_count += 1
                    # Stop as soon as the threshold is reached; no need to
                    # walk the remaining pages.
                    if non_signature_count >= 3:
                        return True
        return False
    except Exception as e:
        # Page/widget access can still fail after a successful open; handle
        # it the same way as an open failure so the caller's routing decision
        # does not crash.
        logger.warning(f"Could not inspect PDF {path} for form fields: {e}")
        return False
    finally:
        doc.close()
|
||||
|
||||
|
||||
def _infer_label(page: "fitz.Page", rect: "fitz.Rect", page_words: list) -> str:
|
||||
"""Best-effort label inference from text near a widget.
|
||||
|
||||
Strategy: prefer text immediately to the left on the same line,
|
||||
then text immediately above. Returns the closest non-empty match
|
||||
or "" if nothing useful is found. AcroForm field_label is rarely
|
||||
populated in real-world forms, so this fallback matters.
|
||||
"""
|
||||
candidates_left = []
|
||||
candidates_above = []
|
||||
line_tol = max(2.0, rect.height * 0.6)
|
||||
|
||||
for w in page_words:
|
||||
wx0, wy0, wx1, wy1, text = w[0], w[1], w[2], w[3], w[4]
|
||||
if not text.strip():
|
||||
continue
|
||||
# Same line, to the left
|
||||
if abs((wy0 + wy1) / 2 - (rect.y0 + rect.y1) / 2) < line_tol and wx1 <= rect.x0 + 1:
|
||||
candidates_left.append((rect.x0 - wx1, wx0, text))
|
||||
# Above, horizontally overlapping
|
||||
elif wy1 <= rect.y0 + 1 and not (wx1 < rect.x0 or wx0 > rect.x1):
|
||||
candidates_above.append((rect.y0 - wy1, wx0, text))
|
||||
|
||||
def _join_nearest(cands, gap_limit):
|
||||
if not cands:
|
||||
return ""
|
||||
cands.sort(key=lambda c: (c[0], c[1]))
|
||||
nearest_dist = cands[0][0]
|
||||
if nearest_dist > gap_limit:
|
||||
return ""
|
||||
same = [c for c in cands if c[0] - nearest_dist < line_tol]
|
||||
same.sort(key=lambda c: c[1])
|
||||
return " ".join(c[2] for c in same).strip()
|
||||
|
||||
label = _join_nearest(candidates_left, gap_limit=200.0)
|
||||
if label:
|
||||
return label
|
||||
return _join_nearest(candidates_above, gap_limit=40.0)
|
||||
|
||||
|
||||
def _widget_on_state(w) -> str:
|
||||
try:
|
||||
return w.on_state() or ""
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def extract_fields(path: str) -> list[dict[str, Any]]:
    """Enumerate form fields, one entry per unique field name.

    Multiple checkbox widgets sharing a field name are treated as a single
    "choice" field whose options are each widget's on-state — that's the
    PDF idiom for radio-style "Included / Excluded / None" rows.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        Dicts with: name, type, label, value, options, page (1-indexed),
        rect (x0,y0,x1,y1) for the first widget in the group, required.
        Entries appear in the order each field name is first encountered.
        An empty list is returned (and an error logged) when the PDF cannot
        be opened.

    Raises:
        RuntimeError: if the optional PyMuPDF dependency is not installed.
    """
    _require_fitz()
    names = _widget_type_names()
    # Dicts preserve insertion order, so this also records first-seen order.
    grouped: dict[str, dict[str, Any]] = {}
    try:
        doc = fitz.open(path)
    except Exception as e:
        logger.error(f"Could not open PDF {path} for field extraction: {e}")
        return []

    try:
        for page_index, page in enumerate(doc):
            # page.widgets() yields lazily and a generator object is always
            # truthy, so materialize it: otherwise the emptiness check never
            # fires and text extraction runs on pages without any widgets.
            widgets = list(page.widgets() or ())
            if not widgets:
                continue
            words = page.get_text("words")
            for w in widgets:
                name = w.field_name or ""
                if not name:
                    continue
                wtype = names.get(w.field_type, "unknown")
                label = (getattr(w, "field_label", None) or "").strip()
                if not label:
                    label = _infer_label(page, w.rect, words)
                value = w.field_value if w.field_value is not None else ""
                on_state = _widget_on_state(w) if wtype == "checkbox" else ""

                if name not in grouped:
                    # AdobeSign-style signature placeholders are stored as
                    # plain text widgets but named with `_es_:signature`.
                    if wtype == "text" and _SIGNATURE_NAME_RE.search(name):
                        wtype = "signature"
                    grouped[name] = {
                        "name": name,
                        "type": wtype,
                        "label": label,
                        "value": value,
                        "options": list(w.choice_values) if w.choice_values else (
                            [on_state] if on_state else []
                        ),
                        "page": page_index + 1,
                        "rect": [w.rect.x0, w.rect.y0, w.rect.x1, w.rect.y1],
                        # Bit value 2 of the field flags marks the field required.
                        "required": bool((w.field_flags or 0) & 2),
                        # Scratch list of distinct on-states; removed before returning.
                        "_on_states": [on_state] if on_state else [],
                    }
                else:
                    g = grouped[name]
                    # Later widgets only fill gaps left by earlier ones.
                    if not g["label"] and label:
                        g["label"] = label
                    if value and not g["value"]:
                        g["value"] = value
                    if on_state and on_state not in g["_on_states"]:
                        g["_on_states"].append(on_state)
                        if on_state not in g["options"]:
                            g["options"].append(on_state)
                    # If a checkbox name appears more than once with different
                    # on-states, promote it to a choice field.
                    if wtype == "checkbox" and len(g["_on_states"]) > 1:
                        g["type"] = "choice"
    finally:
        doc.close()

    out = list(grouped.values())
    for g in out:
        g.pop("_on_states", None)
    return out
|
||||
|
||||
|
||||
def stamp_signatures(
    pdf_path: str,
    output_path: str,
    stamps: dict[str, bytes],
) -> int:
    """Draw PNG signature images over the widgets of the named form fields.

    `stamps` maps field name to PNG bytes. Every widget whose field name has
    a non-empty entry gets the image drawn into its rectangle, on top of the
    page content and with the aspect ratio preserved. The widget itself is
    not removed, so the field can still be edited afterwards.

    The document is saved to `output_path` (a non-incremental, deflated
    save), so pass the source PDF (or an already-filled output from
    fill_fields) and a fresh output_path.

    Returns:
        The number of images successfully drawn; 0 without touching any file
        when `stamps` is empty.
    """
    if not stamps:
        return 0
    _require_fitz()
    doc = fitz.open(pdf_path)
    written = 0
    try:
        for page in doc:
            for widget in page.widgets() or []:
                field_name = widget.field_name
                image_bytes = stamps.get(field_name)
                if not image_bytes:
                    # Either no stamp was requested for this field or it is empty.
                    continue
                try:
                    page.insert_image(
                        widget.rect,
                        stream=image_bytes,
                        keep_proportion=True,
                        overlay=True,
                    )
                except Exception as e:
                    logger.warning(f"Failed to stamp signature into {field_name}: {e}")
                else:
                    written += 1
        doc.save(output_path, incremental=False, deflate=True)
    finally:
        doc.close()
    return written
|
||||
|
||||
|
||||
def stamp_annotations(
|
||||
pdf_path: str,
|
||||
output_path: str,
|
||||
annotations: list[dict],
|
||||
signature_pngs: dict[str, bytes] | None = None,
|
||||
) -> int:
|
||||
"""Burn freeform annotations (text, check, signature) onto a PDF.
|
||||
|
||||
Each annotation has page-percentage coords (x, y, w, h: 0–100), a `kind`
|
||||
in {text, check, signature}, a string value, and a line_height for text.
|
||||
Returns the number of annotations stamped.
|
||||
"""
|
||||
if not annotations:
|
||||
return 0
|
||||
_require_fitz()
|
||||
signature_pngs = signature_pngs or {}
|
||||
doc = fitz.open(pdf_path)
|
||||
written = 0
|
||||
try:
|
||||
for ann in annotations:
|
||||
try:
|
||||
page_no = int(ann.get("page") or 1)
|
||||
if page_no < 1 or page_no > doc.page_count:
|
||||
continue
|
||||
page = doc[page_no - 1]
|
||||
pw, ph = page.rect.width, page.rect.height
|
||||
x = float(ann.get("x", 0)) / 100.0 * pw
|
||||
y = float(ann.get("y", 0)) / 100.0 * ph
|
||||
w = float(ann.get("w", 0)) / 100.0 * pw
|
||||
h = float(ann.get("h", 0)) / 100.0 * ph
|
||||
rect = fitz.Rect(x, y, x + w, y + h)
|
||||
kind = ann.get("kind", "text")
|
||||
value = ann.get("value", "")
|
||||
|
||||
if kind == "text":
|
||||
if not value:
|
||||
continue
|
||||
line_height = float(ann.get("line_height") or 1.3)
|
||||
lines = value.split("\n")
|
||||
# Fixed point size — keeps text consistent across boxes
|
||||
# regardless of how each was resized. Per HTML metrics the
|
||||
# baseline of a line box sits at fontsize × (lh + 0.6) / 2
|
||||
# from the line-box top (half the leading above the glyph,
|
||||
# half below, ascent ≈ 0.8 × fontsize).
|
||||
fontsize = 11.0
|
||||
# Stride between lines is tuned to match what the editor
|
||||
# shows: the editor's textarea renders text larger than
|
||||
# 11pt (cqh-based ≈ 1.5% of page-image height ≈ 17pt for
|
||||
# Letter), so its rows are spaced wider than 11 × lh on
|
||||
# the page. Multiply the export stride to compensate.
|
||||
line_box = fontsize * line_height * 1.2
|
||||
# First baseline at one ascent below the box top — closest
|
||||
# match to where the editor's first line of text appears.
|
||||
yy = y + fontsize * 0.85
|
||||
# Match the textarea's 4px left padding (~3 PDF points).
|
||||
xx = x + 3.0
|
||||
for line in lines:
|
||||
try:
|
||||
page.insert_text(
|
||||
(xx, yy),
|
||||
line,
|
||||
fontsize=fontsize,
|
||||
color=(0, 0, 0),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"insert_text failed for annotation: {e}")
|
||||
yy += line_box
|
||||
written += 1
|
||||
|
||||
elif kind == "check":
|
||||
# Draw a checkmark stroke that fills the box.
|
||||
cx = x + w / 2.0
|
||||
cy = y + h / 2.0
|
||||
size = min(w, h) * 0.85
|
||||
p1 = fitz.Point(cx - size * 0.40, cy + size * 0.05)
|
||||
p2 = fitz.Point(cx - size * 0.10, cy + size * 0.30)
|
||||
p3 = fitz.Point(cx + size * 0.45, cy - size * 0.30)
|
||||
shape = page.new_shape()
|
||||
shape.draw_polyline([p1, p2, p3])
|
||||
shape.finish(
|
||||
color=(0, 0, 0),
|
||||
width=max(1.0, size * 0.13),
|
||||
lineCap=1,
|
||||
lineJoin=1,
|
||||
)
|
||||
shape.commit()
|
||||
written += 1
|
||||
|
||||
elif kind == "signature":
|
||||
if not isinstance(value, str) or not value.startswith("signature:"):
|
||||
continue
|
||||
sid = value[len("signature:"):].strip()
|
||||
png = signature_pngs.get(sid)
|
||||
if not png:
|
||||
continue
|
||||
try:
|
||||
page.insert_image(rect, stream=png, keep_proportion=True, overlay=True)
|
||||
written += 1
|
||||
except Exception as e:
|
||||
logger.warning(f"signature stamp failed: {e}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to stamp annotation {ann.get('id')}: {e}")
|
||||
continue
|
||||
doc.save(output_path, incremental=False, deflate=True)
|
||||
finally:
|
||||
doc.close()
|
||||
return written
|
||||
|
||||
|
||||
def fill_fields(source_path: str, output_path: str, values: dict[str, Any]) -> int:
    """Write `values` into the matching AcroForm widgets and save a new PDF.

    `values` maps field name to the new value. Widgets whose field name is
    not a key of `values` are left untouched. For checkbox widgets a bool
    switches the box on or off; any other value is compared (as a stripped
    string) with the widget's on-state, so that within a group of checkboxes
    sharing one name only the matching widget is switched on. All other
    widget types receive ``str(value)``, with ``None`` written as "".

    Returns:
        The number of widgets updated (every widget of a shared field name
        counts separately).
    """
    _require_fitz()
    doc = fitz.open(source_path)
    updated = 0
    try:
        for page in doc:
            for widget in page.widgets() or []:
                field_name = widget.field_name
                if field_name not in values:
                    continue
                requested = values[field_name]

                if widget.field_type != fitz.PDF_WIDGET_TYPE_CHECKBOX:
                    widget.field_value = "" if requested is None else str(requested)
                else:
                    on_state = _widget_on_state(widget)
                    if isinstance(requested, bool):
                        # Single checkbox: bool semantics.
                        if requested:
                            widget.field_value = on_state or "Yes"
                        else:
                            widget.field_value = "Off"
                    else:
                        # Choice/radio-style group: only the widget whose
                        # on-state equals the requested value is switched on.
                        chosen = "" if requested is None else str(requested).strip()
                        if on_state and on_state == chosen:
                            widget.field_value = on_state
                        else:
                            widget.field_value = "Off"

                widget.update()
                updated += 1
        doc.save(output_path, incremental=False, deflate=True)
    finally:
        doc.close()
    return updated
|
||||
Reference in New Issue
Block a user