From ece1e9f280c075faadf88846bf7dbe599274125f Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 12 Oct 2021 15:35:08 +0100 Subject: [PATCH] more --- cpdfcommand.ml | 15 +++++++++------ cpdfjson.ml | 21 ++++++++++++++------- cpdfjson.mli | 2 +- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 05e3471..a22287c 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -1,3 +1,4 @@ +(* FIXME add -output-json-precombine-contents *) (* cpdf command line tools *) let demo = false let noncomp = false @@ -3406,17 +3407,19 @@ let write_json output pdf = | Stdout -> Cpdfjson.to_output (Pdfio.output_of_channel stdout) - args.jsonparsecontentstreams - args.jsonnostreamdata - args.jsondecompressstreams + ~parse_content:args.jsonparsecontentstreams + ~no_stream_data:args.jsonnostreamdata + ~decompress_streams:args.jsondecompressstreams + ~precombine_page_content:false (* FIXME add arg *) pdf | File filename -> let f = open_out filename in Cpdfjson.to_output (Pdfio.output_of_channel f) - args.jsonparsecontentstreams - args.jsonnostreamdata - args.jsondecompressstreams + ~parse_content:args.jsonparsecontentstreams + ~no_stream_data:args.jsonnostreamdata + ~decompress_streams:args.jsondecompressstreams + ~precombine_page_content:false (* FIXME add arg *) pdf; close_out f diff --git a/cpdfjson.ml b/cpdfjson.ml index a343048..7589e98 100644 --- a/cpdfjson.ml +++ b/cpdfjson.ml @@ -333,9 +333,10 @@ let parse_content_stream pdf resources bs = `List (map (json_of_op pdf false) ops) (* We need to make sure each page only has one page content stream. Otherwise, - if not split on op boundaries, each one would fail to parse on its own. *) -(* Future improvement. Don't blow up shared content streams. *) -let precombine_page_content pdf = + if not split on op boundaries, each one would fail to parse on its own. 
The + caller should really only do this on otherwise-failing files, since it could + blow up any shared content streams. *) +let do_precombine_page_content pdf = let pages' = map (fun page -> @@ -351,8 +352,14 @@ let precombine_page_content pdf = in Pdfpage.change_pages true pdf pages' -let json_of_pdf parse_content no_stream_data decompress_streams pdf = - let pdf = if parse_content then precombine_page_content pdf else pdf in +let json_of_pdf + ~parse_content + ~no_stream_data + ~decompress_streams + ~precombine_page_content + pdf += + let pdf = if parse_content && precombine_page_content then do_precombine_page_content pdf else pdf in if decompress_streams then Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf; Pdf.remove_unreferenced pdf; @@ -405,8 +412,8 @@ let json_of_pdf parse_content no_stream_data decompress_streams pdf = pairs_parsed) (* FIXME Proper streaming to output *) -let to_output o parse_content no_stream_data decompress_streams pdf = - let json = json_of_pdf parse_content no_stream_data decompress_streams pdf in +let to_output o ~parse_content ~no_stream_data ~decompress_streams ~precombine_page_content pdf = + let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~precombine_page_content pdf in o.Pdfio.output_string (J.pretty_to_string json) (* FIXME Proper streaming to output / from input, rather than making a big string first. *) diff --git a/cpdfjson.mli b/cpdfjson.mli index 039b312..1e7b837 100644 --- a/cpdfjson.mli +++ b/cpdfjson.mli @@ -1,2 +1,2 @@ -val to_output : Pdfio.output -> bool -> bool -> bool -> Pdf.t -> unit +val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> precombine_page_content:bool -> Pdf.t -> unit val of_input : Pdfio.input -> Pdf.t