This commit is contained in:
John Whitington 2021-10-12 15:35:08 +01:00
parent a6ed214b21
commit ece1e9f280
3 changed files with 24 additions and 14 deletions

View File

@ -1,3 +1,4 @@
(* FIXME: add a -output-json-precombine-contents command line option. Until then, write_json always passes ~precombine_page_content:false. *)
(* cpdf command line tools *) (* cpdf command line tools *)
let demo = false let demo = false
let noncomp = false let noncomp = false
@ -3406,17 +3407,19 @@ let write_json output pdf =
| Stdout -> | Stdout ->
Cpdfjson.to_output Cpdfjson.to_output
(Pdfio.output_of_channel stdout) (Pdfio.output_of_channel stdout)
args.jsonparsecontentstreams ~parse_content:args.jsonparsecontentstreams
args.jsonnostreamdata ~no_stream_data:args.jsonnostreamdata
args.jsondecompressstreams ~decompress_streams:args.jsondecompressstreams
~precombine_page_content:false (* FIXME add arg *)
pdf pdf
| File filename -> | File filename ->
let f = open_out filename in let f = open_out filename in
Cpdfjson.to_output Cpdfjson.to_output
(Pdfio.output_of_channel f) (Pdfio.output_of_channel f)
args.jsonparsecontentstreams ~parse_content:args.jsonparsecontentstreams
args.jsonnostreamdata ~no_stream_data:args.jsonnostreamdata
args.jsondecompressstreams ~decompress_streams:args.jsondecompressstreams
~precombine_page_content:false (* FIXME add arg *)
pdf; pdf;
close_out f close_out f

View File

@ -333,9 +333,10 @@ let parse_content_stream pdf resources bs =
`List (map (json_of_op pdf false) ops) `List (map (json_of_op pdf false) ops)
(* We need to make sure each page only has one page content stream. Otherwise, (* We need to make sure each page only has one page content stream. Otherwise,
if not split on op boundaries, each one would fail to parse on its own. *) if not split on op boundaries, each one would fail to parse on its own. The
(* Future improvement. Don't blow up shared content streams. *) caller should really only do this on otherwise-failing files, since it could
let precombine_page_content pdf = blow up any shared content streams *)
let do_precombine_page_content pdf =
let pages' = let pages' =
map map
(fun page -> (fun page ->
@ -351,8 +352,14 @@ let precombine_page_content pdf =
in in
Pdfpage.change_pages true pdf pages' Pdfpage.change_pages true pdf pages'
let json_of_pdf parse_content no_stream_data decompress_streams pdf = let json_of_pdf
let pdf = if parse_content then precombine_page_content pdf else pdf in ~parse_content
~no_stream_data
~decompress_streams
~precombine_page_content
pdf
=
let pdf = if parse_content && precombine_page_content then do_precombine_page_content pdf else pdf in
if decompress_streams then if decompress_streams then
Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf; Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
@ -405,8 +412,8 @@ let json_of_pdf parse_content no_stream_data decompress_streams pdf =
pairs_parsed) pairs_parsed)
(* FIXME Proper streaming to output *) (* FIXME Proper streaming to output *)
(* [to_output o ~parse_content ~no_stream_data ~decompress_streams
   ~precombine_page_content pdf] converts [pdf] to JSON with [json_of_pdf],
   passing every flag through unchanged, and writes the pretty-printed text to
   [o] with a single call to [o.Pdfio.output_string]. The whole document is
   rendered to one string by [J.pretty_to_string] before anything is written.
   NOTE(review): each line below comes from a two-column diff view and carries
   the pre-change (positional arguments) text followed by the post-change
   (labelled arguments) text. *)
let to_output o parse_content no_stream_data decompress_streams pdf = let to_output o ~parse_content ~no_stream_data ~decompress_streams ~precombine_page_content pdf =
let json = json_of_pdf parse_content no_stream_data decompress_streams pdf in let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~precombine_page_content pdf in
o.Pdfio.output_string (J.pretty_to_string json) o.Pdfio.output_string (J.pretty_to_string json)
(* FIXME Proper streaming to output / from input, rather than making a big string first. *) (* FIXME Proper streaming to output / from input, rather than making a big string first. *)

View File

@ -1,2 +1,2 @@
(** [to_output o ~parse_content ~no_stream_data ~decompress_streams
    ~precombine_page_content pdf] writes a pretty-printed JSON rendering of
    [pdf] to [o].

    - [precombine_page_content] only has an effect when [parse_content] is
      also [true]; in that case the page content streams are combined before
      conversion.
    - [decompress_streams] decodes the stream of every object before
      conversion.
    - [no_stream_data]: its effect is not visible in this view; presumably it
      omits stream contents from the output. TODO confirm against the
      implementation.

    NOTE(review): this line comes from a two-column diff view and shows the
    former positional signature followed by the current labelled one. *)
val to_output : Pdfio.output -> bool -> bool -> bool -> Pdf.t -> unit val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> precombine_page_content:bool -> Pdf.t -> unit
(** [of_input i] builds a [Pdf.t] from the input [i].
    NOTE(review): the implementation is not visible here; presumably this is
    the inverse of [to_output] and parses the JSON representation. Confirm
    before relying on it. This line is unchanged by the commit and appears
    twice because it comes from a two-column diff view. *)
val of_input : Pdfio.input -> Pdf.t val of_input : Pdfio.input -> Pdf.t