Scaffolding for extended -obj and friends

This commit is contained in:
John Whitington 2024-06-24 15:29:32 +01:00
parent 0a774c9a88
commit 0244a8d09b
2 changed files with 25 additions and 17 deletions

View File

@ -8,7 +8,13 @@ o Verify compliance to PDF/UA via the Matterhorn protocol
o Extract, edit and reapply document structure tree o Extract, edit and reapply document structure tree
o Split structure tree when splitting PDF to save size o Split structure tree when splitting PDF to save size
o Combine structure trees when stamping PDFs o Combine structure trees when stamping PDFs
o Report natural language on -info o Set the natural language of a file
Extended features:
o Allow -obj/-extract-stream to look up nested PDF information
o Merge structure trees better when merging files
o Report top-level natural language on -info
o Report mark information dictionary contents on -info o Report mark information dictionary contents on -info
2.7 (February 2024) 2.7 (February 2024)

View File

@ -24,7 +24,6 @@ let initial_file_size = ref 0
let empty = Pdf.empty () let empty = Pdf.empty ()
(* Wrap up the file reading functions to exit with code 1 when an encryption (* Wrap up the file reading functions to exit with code 1 when an encryption
problem occurs. This happens when object streams are in an encrypted document problem occurs. This happens when object streams are in an encrypted document
and so it can't be read without the right password... The existing error and so it can't be read without the right password... The existing error
@ -217,8 +216,8 @@ type op =
| Chop of int * int | Chop of int * int
| ChopHV of bool * float | ChopHV of bool * float
| ProcessImages | ProcessImages
| ExtractStream of int | ExtractStream of string
| PrintObj of int | PrintObj of string
| Verify of string | Verify of string
| MarkAs of string | MarkAs of string
| RemoveMark of string | RemoveMark of string
@ -1841,15 +1840,15 @@ let setjbig2_lossy_threshold f =
let setprocessimagesinfo () = let setprocessimagesinfo () =
set Cpdfimage.debug_image_processing set Cpdfimage.debug_image_processing
let setextractstream i = let setextractstream s =
args.op <- Some (ExtractStream i) args.op <- Some (ExtractStream s)
let setextractstreamdecomp i = let setextractstreamdecomp s =
args.op <- Some (ExtractStream i); args.op <- Some (ExtractStream s);
args.extract_stream_decompress <- true args.extract_stream_decompress <- true
let setprintobj i = let setprintobj s =
args.op <- Some (PrintObj i) args.op <- Some (PrintObj s)
(* Parse a control file, make an argv, and then make Arg parse it. *) (* Parse a control file, make an argv, and then make Arg parse it. *)
let rec make_control_argv_and_parse filename = let rec make_control_argv_and_parse filename =
@ -2815,9 +2814,9 @@ and specs =
("-rise", Arg.Float (fun f -> Cpdfdrawcontrol.addop (Cpdfdraw.Rise f)), " Set text rise"); ("-rise", Arg.Float (fun f -> Cpdfdrawcontrol.addop (Cpdfdraw.Rise f)), " Set text rise");
("-nl", Arg.Unit (fun () -> Cpdfdrawcontrol.addop Cpdfdraw.Newline), " New line"); ("-nl", Arg.Unit (fun () -> Cpdfdrawcontrol.addop Cpdfdraw.Newline), " New line");
("-newpage", Arg.Unit Cpdfdrawcontrol.addnewpage, " Move to a fresh page"); ("-newpage", Arg.Unit Cpdfdrawcontrol.addnewpage, " Move to a fresh page");
("-extract-stream", Arg.Int setextractstream, " Extract a stream"); ("-extract-stream", Arg.String setextractstream, " Extract a stream");
("-extract-stream-decompress", Arg.Int setextractstreamdecomp, "Extract a stream, decompressing"); ("-extract-stream-decompress", Arg.String setextractstreamdecomp, "Extract a stream, decompressing");
("-obj", Arg.Int setprintobj, "Print object"); ("-obj", Arg.String setprintobj, "Print object");
("-json", Arg.Unit (fun () -> args.format_json <- true), "Format output as JSON"); ("-json", Arg.Unit (fun () -> args.format_json <- true), "Format output as JSON");
("-verify", Arg.String (fun s -> setop (Verify s) ()), "Verify conformance to a standard"); ("-verify", Arg.String (fun s -> setop (Verify s) ()), "Verify conformance to a standard");
("-mark-as", Arg.String (fun s -> setop (MarkAs s) ()), "Mark as conforming to a standard"); ("-mark-as", Arg.String (fun s -> setop (MarkAs s) ()), "Mark as conforming to a standard");
@ -3413,6 +3412,9 @@ let build_enc () =
Pdfwrite.user_password = args.user; Pdfwrite.user_password = args.user;
Pdfwrite.permissions = banlist_of_args ()} Pdfwrite.permissions = banlist_of_args ()}
(* Convert an object specification, as supplied to -obj, -extract-stream or
   -extract-stream-decompress, into an object number. For now a specification
   is simply an integer literal in any form int_of_string accepts. Because the
   options are parsed as plain strings, malformed input is only detected here,
   so we raise Failure with a message naming the bad specification rather than
   the bare "int_of_string" failure. *)
let objnum_of_objspec s =
  try int_of_string s with
    Failure _ -> failwith ("Bad object specification: " ^ s)
let extract_stream pdf decomp objnum = let extract_stream pdf decomp objnum =
let obj = Pdf.lookup_obj pdf objnum in let obj = Pdf.lookup_obj pdf objnum in
Pdf.getstream obj; Pdf.getstream obj;
@ -4454,12 +4456,12 @@ let go () =
~dpi_threshold:args.dpi_threshold ~factor:args.resample_factor ~interpolate:args.resample_interpolate ~dpi_threshold:args.dpi_threshold ~factor:args.resample_factor ~interpolate:args.resample_interpolate
~path_to_jbig2enc:args.path_to_jbig2enc ~path_to_convert:args.path_to_im range pdf; ~path_to_jbig2enc:args.path_to_jbig2enc ~path_to_convert:args.path_to_im range pdf;
write_pdf false pdf write_pdf false pdf
| Some (ExtractStream i) -> | Some (ExtractStream s) ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
extract_stream pdf args.extract_stream_decompress i extract_stream pdf args.extract_stream_decompress (objnum_of_objspec s)
| Some (PrintObj i) -> | Some (PrintObj s) ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
print_obj pdf i print_obj pdf (objnum_of_objspec s)
| Some (Verify standard) -> | Some (Verify standard) ->
begin match standard with begin match standard with
| "PDF/UA-1(matterhorn)" -> | "PDF/UA-1(matterhorn)" ->