"""PDF AcroForm field detection and extraction. Used to decide whether an uploaded PDF should be treated as a fillable form (routed to the pdf_form document type) versus a regular text PDF (routed through document_processor._process_pdf). """ import logging import re from typing import Any # PyMuPDF is an OPTIONAL dependency (AGPL-3.0), required ONLY for the PDF # form-filling feature implemented in this module. The MIT core imports fine # without it; calling these functions without PyMuPDF raises a clear error. # See requirements-optional.txt. try: import fitz # PyMuPDF — optional, AGPL-3.0 except ImportError: # pragma: no cover fitz = None logger = logging.getLogger(__name__) _PYMUPDF_MISSING = ( "PDF form features require PyMuPDF, an optional dependency. Install it with " "`pip install -r requirements-optional.txt` (note: PyMuPDF is AGPL-3.0)." ) def _require_fitz(): """Raise a clear error if the optional PyMuPDF dependency is absent.""" if fitz is None: raise RuntimeError(_PYMUPDF_MISSING) return fitz def _widget_type_names() -> dict: return { fitz.PDF_WIDGET_TYPE_UNKNOWN: "unknown", fitz.PDF_WIDGET_TYPE_BUTTON: "button", fitz.PDF_WIDGET_TYPE_CHECKBOX: "checkbox", fitz.PDF_WIDGET_TYPE_RADIOBUTTON: "radio", fitz.PDF_WIDGET_TYPE_TEXT: "text", fitz.PDF_WIDGET_TYPE_LISTBOX: "listbox", fitz.PDF_WIDGET_TYPE_COMBOBOX: "combobox", fitz.PDF_WIDGET_TYPE_SIGNATURE: "signature", } # Text widgets that are really signature placeholders. Covers DocuSign-style # "_es_:signature" and the bare "signed N" / "Signature" patterns common in # UK conveyancing forms (TA6, TA10). Uses substring match deliberately — # false positives like "assigned" are rare in form-field names. _SIGNATURE_NAME_RE = re.compile(r'sign(?:ed|ature)', re.IGNORECASE) def has_form_fields(path: str) -> bool: """Return True if the PDF looks like a *fillable form* — not just a content PDF that happens to carry a stray widget. Excel-exported PDFs (Japanese estimates, invoices, etc.) often ship with one or two orphan AcroForm widgets (a signature stamp box, a leftover text field from the source template) even when they're really content-only documents. Treating those as forms routes them through the form-fill chat prompt that ASKS the user which field to edit instead of discussing the content — which is exactly the bug we're trying to avoid. Heuristic: require at least 3 non-signature widgets. Signature-only PDFs (e.g. a contract with one sign-here box) read as content, and tiny stray-widget counts no longer hijack the chat. Genuine UK conveyancing forms (TA6, TA10) and similar carry dozens of widgets and still trip this threshold easily. """ _require_fitz() try: doc = fitz.open(path) except Exception as e: logger.warning(f"Could not open PDF {path} for form detection: {e}") return False try: non_signature_count = 0 for page in doc: for w in page.widgets() or []: if w.field_type != fitz.PDF_WIDGET_TYPE_SIGNATURE: non_signature_count += 1 if non_signature_count >= 3: return True return False finally: doc.close() def _infer_label(page: "fitz.Page", rect: "fitz.Rect", page_words: list) -> str: """Best-effort label inference from text near a widget. Strategy: prefer text immediately to the left on the same line, then text immediately above. Returns the closest non-empty match or "" if nothing useful is found. AcroForm field_label is rarely populated in real-world forms, so this fallback matters. """ candidates_left = [] candidates_above = [] line_tol = max(2.0, rect.height * 0.6) for w in page_words: wx0, wy0, wx1, wy1, text = w[0], w[1], w[2], w[3], w[4] if not text.strip(): continue # Same line, to the left if abs((wy0 + wy1) / 2 - (rect.y0 + rect.y1) / 2) < line_tol and wx1 <= rect.x0 + 1: candidates_left.append((rect.x0 - wx1, wx0, text)) # Above, horizontally overlapping elif wy1 <= rect.y0 + 1 and not (wx1 < rect.x0 or wx0 > rect.x1): candidates_above.append((rect.y0 - wy1, wx0, text)) def _join_nearest(cands, gap_limit): if not cands: return "" cands.sort(key=lambda c: (c[0], c[1])) nearest_dist = cands[0][0] if nearest_dist > gap_limit: return "" same = [c for c in cands if c[0] - nearest_dist < line_tol] same.sort(key=lambda c: c[1]) return " ".join(c[2] for c in same).strip() label = _join_nearest(candidates_left, gap_limit=200.0) if label: return label return _join_nearest(candidates_above, gap_limit=40.0) def _widget_on_state(w) -> str: try: return w.on_state() or "" except Exception: return "" def extract_fields(path: str) -> list[dict[str, Any]]: """Enumerate form fields, one entry per unique field name. Multiple checkbox widgets sharing a field name are treated as a single "choice" field whose options are each widget's on-state — that's the PDF idiom for radio-style "Included / Excluded / None" rows. Returns dicts with: name, type, label, value, options, page (1-indexed), rect (x0,y0,x1,y1) for the first widget in the group, required. """ _require_fitz() names = _widget_type_names() grouped: dict[str, dict[str, Any]] = {} order: list[str] = [] try: doc = fitz.open(path) except Exception as e: logger.error(f"Could not open PDF {path} for field extraction: {e}") return [] try: for page_index, page in enumerate(doc): widgets = page.widgets() or [] if not widgets: continue words = page.get_text("words") for w in widgets: name = w.field_name or "" if not name: continue wtype = names.get(w.field_type, "unknown") label = (getattr(w, "field_label", None) or "").strip() if not label: label = _infer_label(page, w.rect, words) value = w.field_value if w.field_value is not None else "" on_state = _widget_on_state(w) if wtype == "checkbox" else "" if name not in grouped: # AdobeSign-style signature placeholders are stored as # plain text widgets but named with `_es_:signature`. if wtype == "text" and _SIGNATURE_NAME_RE.search(name): wtype = "signature" order.append(name) grouped[name] = { "name": name, "type": wtype, "label": label, "value": value, "options": list(w.choice_values) if w.choice_values else ( [on_state] if on_state else [] ), "page": page_index + 1, "rect": [w.rect.x0, w.rect.y0, w.rect.x1, w.rect.y1], "required": bool((w.field_flags or 0) & 2), "_on_states": [on_state] if on_state else [], } else: g = grouped[name] if not g["label"] and label: g["label"] = label if value and not g["value"]: g["value"] = value if on_state and on_state not in g["_on_states"]: g["_on_states"].append(on_state) if on_state not in g["options"]: g["options"].append(on_state) # If a checkbox name appears more than once with different on-states, # promote it to a choice field. if wtype == "checkbox" and len(g["_on_states"]) > 1: g["type"] = "choice" finally: doc.close() out = [] for name in order: g = grouped[name] g.pop("_on_states", None) out.append(g) return out def stamp_signatures( pdf_path: str, output_path: str, stamps: dict[str, bytes], ) -> int: """Stamp PNG signature images into the PDF at each named field's rect. `stamps` is {field_name: png_bytes}. Each named field is found in the AcroForm; the image is drawn into the field's rectangle preserving aspect ratio. The widget itself is left intact (still a form field) so it can be re-edited later if needed; the stamp is rendered on top. Returns the number of stamps written. Pass the source PDF (or an already-filled output from fill_fields) and a fresh output_path. """ if not stamps: return 0 _require_fitz() doc = fitz.open(pdf_path) written = 0 try: for page in doc: for w in page.widgets() or []: name = w.field_name if name not in stamps: continue png = stamps[name] if not png: continue try: page.insert_image(w.rect, stream=png, keep_proportion=True, overlay=True) written += 1 except Exception as e: logger.warning(f"Failed to stamp signature into {name}: {e}") doc.save(output_path, incremental=False, deflate=True) finally: doc.close() return written def stamp_annotations( pdf_path: str, output_path: str, annotations: list[dict], signature_pngs: dict[str, bytes] | None = None, ) -> int: """Burn freeform annotations (text, check, signature) onto a PDF. Each annotation has page-percentage coords (x, y, w, h: 0–100), a `kind` in {text, check, signature}, a string value, and a line_height for text. Returns the number of annotations stamped. """ if not annotations: return 0 _require_fitz() signature_pngs = signature_pngs or {} doc = fitz.open(pdf_path) written = 0 try: for ann in annotations: try: page_no = int(ann.get("page") or 1) if page_no < 1 or page_no > doc.page_count: continue page = doc[page_no - 1] pw, ph = page.rect.width, page.rect.height x = float(ann.get("x", 0)) / 100.0 * pw y = float(ann.get("y", 0)) / 100.0 * ph w = float(ann.get("w", 0)) / 100.0 * pw h = float(ann.get("h", 0)) / 100.0 * ph rect = fitz.Rect(x, y, x + w, y + h) kind = ann.get("kind", "text") value = ann.get("value", "") if kind == "text": if not value: continue line_height = float(ann.get("line_height") or 1.3) lines = value.split("\n") # Fixed point size — keeps text consistent across boxes # regardless of how each was resized. Per HTML metrics the # baseline of a line box sits at fontsize × (lh + 0.6) / 2 # from the line-box top (half the leading above the glyph, # half below, ascent ≈ 0.8 × fontsize). fontsize = 11.0 # Stride between lines is tuned to match what the editor # shows: the editor's textarea renders text larger than # 11pt (cqh-based ≈ 1.5% of page-image height ≈ 17pt for # Letter), so its rows are spaced wider than 11 × lh on # the page. Multiply the export stride to compensate. line_box = fontsize * line_height * 1.2 # First baseline at one ascent below the box top — closest # match to where the editor's first line of text appears. yy = y + fontsize * 0.85 # Match the textarea's 4px left padding (~3 PDF points). xx = x + 3.0 for line in lines: try: page.insert_text( (xx, yy), line, fontsize=fontsize, color=(0, 0, 0), ) except Exception as e: logger.warning(f"insert_text failed for annotation: {e}") yy += line_box written += 1 elif kind == "check": # Draw a checkmark stroke that fills the box. cx = x + w / 2.0 cy = y + h / 2.0 size = min(w, h) * 0.85 p1 = fitz.Point(cx - size * 0.40, cy + size * 0.05) p2 = fitz.Point(cx - size * 0.10, cy + size * 0.30) p3 = fitz.Point(cx + size * 0.45, cy - size * 0.30) shape = page.new_shape() shape.draw_polyline([p1, p2, p3]) shape.finish( color=(0, 0, 0), width=max(1.0, size * 0.13), lineCap=1, lineJoin=1, ) shape.commit() written += 1 elif kind == "signature": if not isinstance(value, str) or not value.startswith("signature:"): continue sid = value[len("signature:"):].strip() png = signature_pngs.get(sid) if not png: continue try: page.insert_image(rect, stream=png, keep_proportion=True, overlay=True) written += 1 except Exception as e: logger.warning(f"signature stamp failed: {e}") except Exception as e: logger.warning(f"Failed to stamp annotation {ann.get('id')}: {e}") continue doc.save(output_path, incremental=False, deflate=True) finally: doc.close() return written def fill_fields(source_path: str, output_path: str, values: dict[str, Any]) -> int: """Write values back into the AcroForm and save a new PDF. Returns the number of fields updated. Unknown field names are ignored. Layout of the source PDF is preserved. """ _require_fitz() doc = fitz.open(source_path) updated = 0 try: for page in doc: for w in page.widgets() or []: name = w.field_name if name not in values: continue new_value = values[name] if w.field_type == fitz.PDF_WIDGET_TYPE_CHECKBOX: on_state = _widget_on_state(w) if isinstance(new_value, bool): # Single checkbox: bool semantics w.field_value = (on_state or "Yes") if new_value else "Off" else: # Choice/radio group: only the widget whose on_state matches # gets that on_state; the rest go Off. chosen = "" if new_value is None else str(new_value).strip() w.field_value = on_state if on_state and on_state == chosen else "Off" else: w.field_value = "" if new_value is None else str(new_value) w.update() updated += 1 doc.save(output_path, incremental=False, deflate=True) finally: doc.close() return updated