From 1491f30094122af3b04f225760fe00312f70f9d1 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Mon, 11 Nov 2024 16:47:49 +0000 Subject: [PATCH] First stab at %Bookmark --- Changes | 1 + cpdfaddtext.ml | 34 +++++++++++++++++++++++++++++++--- cpdfaddtext.mli | 2 ++ cpdfdraw.ml | 5 ++++- 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/Changes b/Changes index a7a8d0a..adce670 100644 --- a/Changes +++ b/Changes @@ -15,6 +15,7 @@ o Show OpenAction in -info o Show more form information in -info o Show XFA in -info o Allow JSON / PDF syntax in dict processing and object exploration +o %Bookmark and friends when stamping text Fixes: diff --git a/cpdfaddtext.ml b/cpdfaddtext.ml index 41816eb..7b7db77 100644 --- a/cpdfaddtext.ml +++ b/cpdfaddtext.ml @@ -177,7 +177,24 @@ let pagelabel pdf num = num (Pdfpagelabels.complete (Pdfpagelabels.read pdf)) -let replace_pairs pdf endpage extract_text_font_size filename bates batespad num page = +(* Return UTF8 of current bookmark at given level at start of page. No bookmark + available = empty string. *) +let bookmark marks fastrefnums level pdf num = + let before, _ = + (* 1. Pick all marks up to and including those on the needed page. *) + cleavewhile (fun mark -> Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target <= num) marks + in + match + (* 2. Remove from the list anything up to the last mark which is at higher + level. This prevents sections in an earlier chapter showing up as + bookmarks in a later chapter if no section has yet been introduced in + that chapter. Do this by reversing, then keeping everything up to any higher level. Then re-reverse. *) + rev (fst (cleavewhile (fun mark -> mark.Pdfmarks.level = level) (rev before))) + with + | h::_ -> Pdftext.utf8_of_pdfdocstring h.Pdfmarks.text + | [] -> "" + +let replace_pairs marks fastrefnums pdf endpage extract_text_font_size filename bates batespad num page = [ "%PageDiv2", (fun () -> string_of_int ((num + 1) / 2)); "%Page", (fun () -> string_of_int num); @@ -187,6 +204,11 @@ let replace_pairs pdf endpage extract_text_font_size filename bates batespad num "%Label", (fun () -> pagelabel pdf num); "%EndPage", (fun () -> string_of_int endpage); "%EndLabel", (fun () -> pagelabel pdf endpage); + "%Bookmark0", (fun () -> bookmark marks fastrefnums 0 pdf num); + "%Bookmark1", (fun () -> bookmark marks fastrefnums 1 pdf num); + "%Bookmark2", (fun () -> bookmark marks fastrefnums 2 pdf num); + "%Bookmark3", (fun () -> bookmark marks fastrefnums 3 pdf num); + "%Bookmark4", (fun () -> bookmark marks fastrefnums 4 pdf num); "%ExtractedText", (fun () -> Cpdfextracttext.extract_page_text extract_text_font_size pdf num page); "%Bates", (fun () -> @@ -199,10 +221,13 @@ let replace_pairs pdf endpage extract_text_font_size filename bates batespad num else implode (many '0' (w - String.length numstring)) ^ numstring))] let expand_lines text time pdf endpage extract_text_font_size filename bates batespad num page lines = + let refnums = Pdf.page_reference_numbers pdf in + let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in + let marks = Pdfmarks.read_bookmarks pdf in let expanded_lines = map (function text -> - process_text time text (replace_pairs pdf endpage extract_text_font_size filename bates batespad num page)) + process_text time text (replace_pairs marks fastrefnums pdf endpage extract_text_font_size filename bates batespad num page)) lines in (* process URLs for justification too *) @@ -291,7 +316,10 @@ let addtext (indx0 (fst fontpack)) in let ops, urls, x, y, hoffset, voffset, text, joffset = - let text = process_text time text (replace_pairs pdf endpage extract_text_font_size filename bates batespad num page) in + let refnums = Pdf.page_reference_numbers pdf in + let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in + let marks = Pdfmarks.read_bookmarks pdf in + let text = process_text time text (replace_pairs marks fastrefnums pdf endpage extract_text_font_size filename bates batespad num page) in let text, urls = get_urls_line text in let lines = map (fun text -> if raw || fontpack <> None then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text) lines in let expanded_lines = expand_lines text time pdf endpage extract_text_font_size filename bates batespad num page lines in diff --git a/cpdfaddtext.mli b/cpdfaddtext.mli index 5388529..8992d50 100644 --- a/cpdfaddtext.mli +++ b/cpdfaddtext.mli @@ -61,6 +61,8 @@ val addrectangle : (**/**) val replace_pairs : + Pdfmarks.t list -> + (int, int) Hashtbl.t -> Pdf.t -> int -> float option -> diff --git a/cpdfdraw.ml b/cpdfdraw.ml index b39495f..cb28844 100644 --- a/cpdfdraw.ml +++ b/cpdfdraw.ml @@ -164,8 +164,11 @@ let reset_state () = (res ()).page_names <- []*) let process_specials pdf endpage filename bates batespad num page s = + let refnums = Pdf.page_reference_numbers pdf in + let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in + let marks = Pdfmarks.read_bookmarks pdf in let pairs = - Cpdfaddtext.replace_pairs pdf endpage None filename bates batespad num page + Cpdfaddtext.replace_pairs marks fastrefnums pdf endpage None filename bates batespad num page in Cpdfaddtext.process_text (res ()).time s pairs