option to uncompress json streams

This commit is contained in:
John Whitington 2021-10-04 18:38:36 +01:00
parent 33e2a86f55
commit 14edc5a5db
3 changed files with 40 additions and 13 deletions

View File

@ -450,6 +450,7 @@ type args =
mutable removeonly : string option; mutable removeonly : string option;
mutable jsonparsecontentstreams : bool; mutable jsonparsecontentstreams : bool;
mutable jsonnostreamdata : bool; mutable jsonnostreamdata : bool;
mutable jsondecompressstreams : bool;
mutable ocgrenamefrom : string; mutable ocgrenamefrom : string;
mutable ocgrenameto : string; mutable ocgrenameto : string;
mutable dedup : bool; mutable dedup : bool;
@ -554,6 +555,7 @@ let args =
removeonly = None; removeonly = None;
jsonparsecontentstreams = false; jsonparsecontentstreams = false;
jsonnostreamdata = false; jsonnostreamdata = false;
jsondecompressstreams = false;
ocgrenamefrom = ""; ocgrenamefrom = "";
ocgrenameto = ""; ocgrenameto = "";
dedup = false; dedup = false;
@ -643,6 +645,7 @@ let reset_arguments () =
args.removeonly <- None; args.removeonly <- None;
args.jsonparsecontentstreams <- false; args.jsonparsecontentstreams <- false;
args.jsonnostreamdata <- false; args.jsonnostreamdata <- false;
args.jsondecompressstreams <- false;
args.ocgrenamefrom <- ""; args.ocgrenamefrom <- "";
args.ocgrenameto <- ""; args.ocgrenameto <- "";
args.dedup <- false; args.dedup <- false;
@ -1499,6 +1502,9 @@ let setjsonparsecontentstreams () =
let setjsonnostreamdata () = let setjsonnostreamdata () =
args.jsonnostreamdata <- true args.jsonnostreamdata <- true
(* Command-line handler for -output-json-decompress-streams: records in the
   shared [args] record that stream decompression was requested. *)
let setjsondecompressstreams () : unit =
  args.jsondecompressstreams <- true
let setocgrenamefrom s = let setocgrenamefrom s =
args.ocgrenamefrom <- s args.ocgrenamefrom <- s
@ -2194,6 +2200,9 @@ and specs =
("-output-json-no-stream-data", ("-output-json-no-stream-data",
Arg.Unit setjsonnostreamdata, Arg.Unit setjsonnostreamdata,
" Skip stream data for brevity"); " Skip stream data for brevity");
(* Sets args.jsondecompressstreams, which write_json forwards to
   Cpdfjson.to_output so that streams are decoded before being emitted.
   The help text previously repeated the -output-json-no-stream-data
   description by mistake. *)
("-output-json-decompress-streams",
Arg.Unit setjsondecompressstreams,
" Decompress streams in JSON output");
("-j", ("-j",
Arg.String set_json_input, Arg.String set_json_input,
"Load a PDF JSON file"); "Load a PDF JSON file");
@ -3395,10 +3404,20 @@ let write_json output pdf =
| NoOutputSpecified -> | NoOutputSpecified ->
error "-output-json: no output name specified" error "-output-json: no output name specified"
| Stdout -> | Stdout ->
Cpdfjson.to_output (Pdfio.output_of_channel stdout) args.jsonparsecontentstreams args.jsonnostreamdata pdf Cpdfjson.to_output
(Pdfio.output_of_channel stdout)
args.jsonparsecontentstreams
args.jsonnostreamdata
args.jsondecompressstreams
pdf
| File filename -> | File filename ->
let f = open_out filename in let f = open_out filename in
Cpdfjson.to_output (Pdfio.output_of_channel f) args.jsonparsecontentstreams args.jsonnostreamdata pdf; Cpdfjson.to_output
(Pdfio.output_of_channel f)
args.jsonparsecontentstreams
args.jsonnostreamdata
args.jsondecompressstreams
pdf;
close_out f close_out f
(* Main function *) (* Main function *)

View File

@ -299,8 +299,16 @@ let precombine_page_content pdf =
in in
Pdfpage.change_pages true pdf pages' Pdfpage.change_pages true pdf pages'
let json_of_pdf parse_content no_stream_data pdf = let json_of_pdf parse_content no_stream_data decompress_streams pdf =
let pdf = if parse_content then precombine_page_content pdf else pdf in let pdf = if parse_content then precombine_page_content pdf else pdf in
(* When [decompress_streams] is set, run
   Pdfcodec.decode_pdfstream_until_unknown over every stream object in the
   document before it is converted to JSON. Non-stream objects are left
   untouched. The per-object debugging output formerly written to stderr
   here has been removed. *)
if decompress_streams then
  Pdf.objiter
    (fun _ obj ->
       match obj with
       | Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj
       | _ -> ())
    pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data pdf.P.trailerdict) in let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data pdf.P.trailerdict) in
let parameters = let parameters =
@ -313,7 +321,10 @@ let json_of_pdf parse_content no_stream_data pdf =
])) ]))
in in
let content_streams = ref [] in let content_streams = ref [] in
let fcs n = content_streams := n::!content_streams in let fcs n =
content_streams := n::!content_streams;
if parse_content then Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n)
in
let pairs = let pairs =
let ps = ref [] in let ps = ref [] in
P.objiter P.objiter
@ -322,8 +333,6 @@ let json_of_pdf parse_content no_stream_data pdf =
pdf; pdf;
parameters::trailerdict::!ps parameters::trailerdict::!ps
in in
if parse_content then
iter (fun n -> Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n)) !content_streams;
let pairs_parsed = let pairs_parsed =
if not parse_content then pairs else if not parse_content then pairs else
map map
@ -350,8 +359,7 @@ let json_of_pdf parse_content no_stream_data pdf =
pairs_parsed) pairs_parsed)
let pdf_of_json json = let pdf_of_json json =
(*flprint (J.show json); (*flprint (J.show json); flprint "\n";*)
flprint "\n";*)
let objs = match json with J.Array objs -> objs | _ -> error "bad json top level" in let objs = match json with J.Array objs -> objs | _ -> error "bad json top level" in
let params = ref Pdf.Null in let params = ref Pdf.Null in
let trailerdict = ref Pdf.Null in let trailerdict = ref Pdf.Null in
@ -369,8 +377,7 @@ let pdf_of_json json =
| _ -> error "json bad obj") | _ -> error "json bad obj")
objs objs
in in
(*List. (*List. iter (fun (i, o) -> flprint (soi i); flprint "\n"; flprint (Pdfwrite.string_of_pdf o); flprint "\n") objects;*)
iter (fun (i, o) -> flprint (soi i); flprint "\n"; flprint (Pdfwrite.string_of_pdf o); flprint "\n") objects;*)
begin match Pdf.lookup_direct (Pdf.empty ()) "/CPDFJSONstreamdataincluded" !params with begin match Pdf.lookup_direct (Pdf.empty ()) "/CPDFJSONstreamdataincluded" !params with
| Some (Pdf.Boolean false) -> error "no stream data; cannot reconstruct PDF" | Some (Pdf.Boolean false) -> error "no stream data; cannot reconstruct PDF"
| _ -> () | _ -> ()
@ -408,12 +415,13 @@ let pdf_of_json json =
P.saved_encryption = None} P.saved_encryption = None}
(* FIXME Proper streaming to output / from input, rather than making a big string first. *) (* FIXME Proper streaming to output / from input, rather than making a big string first. *)
let to_output o parse_content no_stream_data pdf = let to_output o parse_content no_stream_data decompress_streams pdf =
let b = Buffer.create 256 in let b = Buffer.create 256 in
let formatter = Format.formatter_of_buffer b in let formatter = Format.formatter_of_buffer b in
J.format formatter (json_of_pdf parse_content no_stream_data pdf); J.format formatter (json_of_pdf parse_content no_stream_data decompress_streams pdf);
Format.pp_print_flush formatter (); Format.pp_print_flush formatter ();
o.Pdfio.output_string (Buffer.contents b) o.Pdfio.output_string (Buffer.contents b)
(* FIXME Proper streaming from input, rather than reading it all into one big string first. *)
let of_input i = let of_input i =
pdf_of_json (J.parse (Pdfio.string_of_bytes (Pdfio.bytes_of_input i 0 (i.Pdfio.in_channel_length)))) pdf_of_json (J.parse (Pdfio.string_of_bytes (Pdfio.bytes_of_input i 0 (i.Pdfio.in_channel_length))))

View File

@ -1,2 +1,2 @@
val to_output : Pdfio.output -> bool -> bool -> Pdf.t -> unit val to_output : Pdfio.output -> bool -> bool -> bool -> Pdf.t -> unit
val of_input : Pdfio.input -> Pdf.t val of_input : Pdfio.input -> Pdf.t