Added -output-json-no-stream-data

This commit is contained in:
John Whitington 2020-02-01 10:18:15 +00:00
parent 655d7ce069
commit eec4c9b5a1
3 changed files with 33 additions and 23 deletions

View File

@ -436,7 +436,8 @@ type args =
mutable createpdf_pages : int; mutable createpdf_pages : int;
mutable createpdf_pagesize : Pdfpaper.t; mutable createpdf_pagesize : Pdfpaper.t;
mutable removeonly : string option; mutable removeonly : string option;
mutable jsonparsecontentstreams : bool} mutable jsonparsecontentstreams : bool;
mutable jsonnostreamdata : bool}
let args = let args =
{op = None; {op = None;
@ -533,7 +534,8 @@ let args =
createpdf_pages = 1; createpdf_pages = 1;
createpdf_pagesize = Pdfpaper.a4; createpdf_pagesize = Pdfpaper.a4;
removeonly = None; removeonly = None;
jsonparsecontentstreams = false} jsonparsecontentstreams = false;
jsonnostreamdata = false}
let reset_arguments () = let reset_arguments () =
args.op <- None; args.op <- None;
@ -619,7 +621,8 @@ let reset_arguments () =
args.createpdf_pages <- 1; args.createpdf_pages <- 1;
args.createpdf_pagesize <- Pdfpaper.a4; args.createpdf_pagesize <- Pdfpaper.a4;
args.removeonly <- None; args.removeonly <- None;
args.jsonparsecontentstreams <- false args.jsonparsecontentstreams <- false;
args.jsonnostreamdata <- false
(* Do not reset original_filename or cpdflin or was_encrypted or (* Do not reset original_filename or cpdflin or was_encrypted or
* was_decrypted_with_owner or recrypt or producer or creator or * was_decrypted_with_owner or recrypt or producer or creator or
* path_to_ghostscript or gs_malformed or gs_quiet, since we want these to work across * path_to_ghostscript or gs_malformed or gs_quiet, since we want these to work across
@ -1458,6 +1461,9 @@ let setgsquiet () =
(* Handler for the -output-json-parse-content-streams option: turns on the
   [jsonparsecontentstreams] field of the global [args] record. *)
let setjsonparsecontentstreams () = args.jsonparsecontentstreams <- true
(* Handler for the -output-json-no-stream-data option: turns on the
   [jsonnostreamdata] field of the global [args] record. *)
let setjsonnostreamdata () = args.jsonnostreamdata <- true
(* Print the one accepted command-line form for -gs-malformed-force on
   stderr, then terminate the process with exit status 1. Never returns. *)
let whingemalformed () =
  let usage =
    "Command line must be of exactly the form\ncpdf <infile> -gs <path> -gs-malformed-force -o <outfile>\n"
  in
  prerr_string usage;
  exit 1
@ -2092,6 +2098,7 @@ and specs =
(* These items are undocumented *) (* These items are undocumented *)
("-output-json", Arg.Unit (setop OutputJSON), ""); ("-output-json", Arg.Unit (setop OutputJSON), "");
("-output-json-parse-content-streams", Arg.Unit setjsonparsecontentstreams, ""); ("-output-json-parse-content-streams", Arg.Unit setjsonparsecontentstreams, "");
("-output-json-no-stream-data", Arg.Unit setjsonnostreamdata, "");
("-remove-unused-resources", Arg.Unit (setop RemoveUnusedResources), ""); ("-remove-unused-resources", Arg.Unit (setop RemoveUnusedResources), "");
("-stay-on-error", Arg.Unit setstayonerror, ""); ("-stay-on-error", Arg.Unit setstayonerror, "");
("-extract-fontfile", Arg.Unit (setop ExtractFontFile), ""); ("-extract-fontfile", Arg.Unit (setop ExtractFontFile), "");
@ -3550,10 +3557,10 @@ let write_json output pdf =
| NoOutputSpecified -> | NoOutputSpecified ->
error "-output-json: no output name specified" error "-output-json: no output name specified"
| Stdout -> | Stdout ->
CpdfwriteJSON.write stdout args.jsonparsecontentstreams pdf CpdfwriteJSON.write stdout args.jsonparsecontentstreams args.jsonnostreamdata pdf
| File filename -> | File filename ->
let f = open_out_bin filename in let f = open_out_bin filename in
CpdfwriteJSON.write f args.jsonparsecontentstreams pdf; CpdfwriteJSON.write f args.jsonparsecontentstreams args.jsonnostreamdata pdf;
close_out f close_out f
(* Main function *) (* Main function *)

View File

@ -4,14 +4,14 @@ module O = Pdfops
(* FIXME jsonlint doesn't like tiny_json's 0., 1. etc. *) (* FIXME jsonlint doesn't like tiny_json's 0., 1. etc. *)
let rec json_of_object fcs = function let rec json_of_object fcs no_stream_data = function
| P.Null -> J.String "null" | P.Null -> J.String "null"
| P.Boolean b -> J.Bool b | P.Boolean b -> J.Bool b
| P.Integer i -> J.Number (string_of_int i) | P.Integer i -> J.Number (string_of_int i)
| P.Real r -> J.Number (string_of_float r) | P.Real r -> J.Number (string_of_float r)
| P.String s -> J.String s | P.String s -> J.String s
| P.Name n -> J.String n | P.Name n -> J.String n
| P.Array objs -> J.Array (List.map (json_of_object fcs) objs) | P.Array objs -> J.Array (List.map (json_of_object fcs no_stream_data) objs)
| P.Dictionary elts -> | P.Dictionary elts ->
List.iter List.iter
(function (function
@ -19,18 +19,21 @@ let rec json_of_object fcs = function
| ("/Contents", P.Array elts) -> List.iter (function P.Indirect i -> fcs i | _ -> ()) elts | ("/Contents", P.Array elts) -> List.iter (function P.Indirect i -> fcs i | _ -> ()) elts
| _ -> ()) | _ -> ())
elts; elts;
J.Object (List.map (fun (k, v) -> (k, json_of_object fcs v)) elts) J.Object (List.map (fun (k, v) -> (k, json_of_object fcs no_stream_data v)) elts)
| P.Stream {contents = (Pdf.Dictionary dict, stream)} as thestream -> | P.Stream {contents = (Pdf.Dictionary dict, stream)} as thestream ->
Pdf.getstream thestream; Pdf.getstream thestream;
let str = match stream with Got b -> Pdfio.string_of_bytes b | ToGet _ -> "failure: toget" in let str =
json_of_object fcs (P.Array [P.Dictionary dict; P.String str]) if no_stream_data then "<<stream data elided>>" else
match stream with Pdf.Got b -> Pdfio.string_of_bytes b | Pdf.ToGet _ -> "failure: toget"
in
json_of_object fcs no_stream_data (P.Array [P.Dictionary dict; P.String str])
| P.Stream _ -> J.String "error: stream with not-a-dictioary" | P.Stream _ -> J.String "error: stream with not-a-dictioary"
| P.Indirect i -> J.Number (string_of_int i) | P.Indirect i -> J.Number (string_of_int i)
(* Short names for the standard number-to-string conversions, used when
   building J.Number values. *)
let sof x = string_of_float x
let soi n = string_of_int n
let json_of_op = function let json_of_op no_stream_data = function
| O.Op_S -> J.Array [J.String "S"] | O.Op_S -> J.Array [J.String "S"]
| O.Op_s -> J.Array [J.String "s"] | O.Op_s -> J.Array [J.String "s"]
| O.Op_f -> J.Array [J.String "f"] | O.Op_f -> J.Array [J.String "f"]
@ -94,7 +97,7 @@ let json_of_op = function
J.Number (sof d); J.String "y"] J.Number (sof d); J.String "y"]
| O.Op_Tc c -> J.Array [J.Number (sof c); J.String "Tc"] | O.Op_Tc c -> J.Array [J.Number (sof c); J.String "Tc"]
| O.Op_Tw w -> J.Array [J.Number (sof w); J.String "Tw"] | O.Op_Tw w -> J.Array [J.Number (sof w); J.String "Tw"]
| O.Op_Tz z -> J.Array [J.Number (sof z); String "Tz"] | O.Op_Tz z -> J.Array [J.Number (sof z); J.String "Tz"]
| O.Op_TL l -> J.Array [J.Number (sof l); J.String "TL"] | O.Op_TL l -> J.Array [J.Number (sof l); J.String "TL"]
| O.Op_Tf (k, s) -> J.Array [J.String k; J.Number (sof s); J.String "Tf"] | O.Op_Tf (k, s) -> J.Array [J.String k; J.Number (sof s); J.String "Tf"]
| O.Op_Tr i -> J.Array [J.Number (soi i); J.String "Tr"] | O.Op_Tr i -> J.Array [J.Number (soi i); J.String "Tr"]
@ -111,7 +114,7 @@ let json_of_op = function
J.Number (sof t.Pdftransform.f); J.Number (sof t.Pdftransform.f);
J.String "Tm"] J.String "Tm"]
| O.Op_Tj s -> J.Array [J.String s; J.String "Tj"] | O.Op_Tj s -> J.Array [J.String s; J.String "Tj"]
| O.Op_TJ pdfobject -> J.Array [json_of_object (fun _ -> ()) pdfobject; J.String "TJ"] | O.Op_TJ pdfobject -> J.Array [json_of_object (fun _ -> ()) no_stream_data pdfobject; J.String "TJ"]
| O.Op_' s -> J.Array [J.String s; J.String "'"] | O.Op_' s -> J.Array [J.String s; J.String "'"]
| O.Op_'' (k, k', s) -> J.Array [J.Number (sof k); J.Number (sof k'); J.String s; J.String "''"] | O.Op_'' (k, k', s) -> J.Array [J.Number (sof k); J.Number (sof k'); J.String s; J.String "''"]
| O.Op_d0 (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "d0"] | O.Op_d0 (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "d0"]
@ -136,8 +139,8 @@ let json_of_op = function
J.Array (List.map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "SCNName"]) J.Array (List.map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "SCNName"])
| O.Op_scnName (s, fs) -> | O.Op_scnName (s, fs) ->
J.Array (List.map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "scnName"]) J.Array (List.map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "scnName"])
| O.InlineImage (dict, data) -> J.Array [json_of_object (fun _ -> ()) dict; J.String (Pdfio.string_of_bytes data)] | O.InlineImage (dict, data) -> J.Array [json_of_object (fun _ -> ()) no_stream_data dict; J.String (Pdfio.string_of_bytes data)]
| O.Op_DP (s, obj) -> J.Array [J.String s; json_of_object (fun _ -> ()) obj; J.String "DP"] | O.Op_DP (s, obj) -> J.Array [J.String s; json_of_object (fun _ -> ()) no_stream_data obj; J.String "DP"]
(* parse_stream needs pdf and resources. These are for lexing of inline images, (* parse_stream needs pdf and resources. These are for lexing of inline images,
* looking up the colourspace. We do not need to worry about inherited * looking up the colourspace. We do not need to worry about inherited
@ -145,17 +148,17 @@ let json_of_op = function
* PDF standard. *) * PDF standard. *)
(* [parse_content_stream pdf resources bs] parses the single content stream
   [bs] into operators with [Pdfops.parse_stream] and returns them as a JSON
   array, one [json_of_op] element per operator. [json_of_op] is always called
   with [no_stream_data = false] here. *)
let parse_content_stream pdf resources bs =
  J.Array
    ([bs]
     |> Pdfops.parse_stream pdf resources
     |> List.map (json_of_op false))
let json_of_pdf parse_content pdf = let json_of_pdf parse_content no_stream_data pdf =
let trailerdict = (0, json_of_object (fun x -> ()) pdf.Pdf.trailerdict) in let trailerdict = (0, json_of_object (fun x -> ()) no_stream_data pdf.Pdf.trailerdict) in
let content_streams = ref [] in let content_streams = ref [] in
let fcs n = content_streams := n::!content_streams in let fcs n = content_streams := n::!content_streams in
let pairs = let pairs =
let ps = ref [] in let ps = ref [] in
Pdf.objiter Pdf.objiter
(fun i pdfobj -> (fun i pdfobj ->
ps := (i, json_of_object fcs pdfobj)::!ps) ps := (i, json_of_object fcs no_stream_data pdfobj)::!ps)
pdf; pdf;
trailerdict::!ps trailerdict::!ps
in in
@ -171,7 +174,7 @@ let json_of_pdf parse_content pdf =
(* FIXME Proper resources here for reasons explained above *) (* FIXME Proper resources here for reasons explained above *)
let streamdata = let streamdata =
match Pdf.lookup_obj pdf objnum with match Pdf.lookup_obj pdf objnum with
| Stream {contents = (_, Got b)} -> b | Pdf.Stream {contents = (_, Pdf.Got b)} -> b
| _ -> failwith "JSON: stream not decoded" | _ -> failwith "JSON: stream not decoded"
in in
(objnum, J.Array [dict; parse_content_stream pdf (Pdf.Dictionary []) streamdata]) (objnum, J.Array [dict; parse_content_stream pdf (Pdf.Dictionary []) streamdata])
@ -186,10 +189,10 @@ let json_of_pdf parse_content pdf =
(fun (objnum, jsonobj) -> J.Array [J.String (string_of_int objnum); jsonobj]) (fun (objnum, jsonobj) -> J.Array [J.String (string_of_int objnum); jsonobj])
pairs_parsed) pairs_parsed)
(* [write fh parse_content no_stream_data pdf] converts [pdf] to JSON with
   [json_of_pdf], pretty-prints it with [Tjjson.format] into an in-memory
   buffer, and then writes that buffer to the channel [fh]. The channel is
   neither flushed nor closed here; that is left to the caller. *)
let write fh parse_content no_stream_data pdf =
  let buf = Buffer.create 256 in
  let fmt = Format.formatter_of_buffer buf in
  json_of_pdf parse_content no_stream_data pdf |> Tjjson.format fmt;
  (* Push any text still pending in the formatter into [buf] before output. *)
  Format.pp_print_flush fmt ();
  Buffer.output_buffer fh buf

View File

@ -1 +1 @@
(** [write fh parse_content no_stream_data pdf] writes a JSON rendering of
    [pdf] to the output channel [fh].

    [parse_content] is forwarded to the JSON conversion; presumably it selects
    whether page content streams are emitted as parsed operators (TODO confirm
    against the implementation). When [no_stream_data] is [true], the data of
    each stream object is replaced by the placeholder string
    ["<<stream data elided>>"]. The channel is not closed by this function. *)
val write : out_channel -> bool -> Pdf.t -> unit val write : out_channel -> bool -> bool -> Pdf.t -> unit