Fixing up JSON text output

This commit is contained in:
John Whitington 2021-12-30 15:25:24 +00:00
parent 618da13a80
commit dd2f8fd161
8 changed files with 44 additions and 22 deletions

View File

@ -26,7 +26,7 @@ let annotations_json_page pdf page pagenum =
| Some (Pdf.Array annots) -> | Some (Pdf.Array annots) ->
map map
(fun annot -> (fun annot ->
`List [`Int pagenum; Cpdfjson.json_of_object pdf (fun _ -> ()) false false annot]) `List [`Int pagenum; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot])
(map (Pdf.direct pdf) annots) (map (Pdf.direct pdf) annots)
| _ -> [] | _ -> []

View File

@ -184,7 +184,7 @@ let output_json_marks ch calculate_page_number pdf fastrefnums marks =
let json_of_mark m = let json_of_mark m =
`Assoc `Assoc
[("level", `Int m.Pdfmarks.level); [("level", `Int m.Pdfmarks.level);
("text", `String (Pdftext.utf8_of_pdfdocstring m.Pdfmarks.text)); ("text", `String (Pdftext.utf8_of_pdfdocstring (Pdftext.simplify_utf16be m.Pdfmarks.text)));
("page", `Int (calculate_page_number m)); ("page", `Int (calculate_page_number m));
("open", `Bool m.Pdfmarks.isopen); ("open", `Bool m.Pdfmarks.isopen);
("target", json_of_target pdf fastrefnums m.Pdfmarks.target)] ("target", json_of_target pdf fastrefnums m.Pdfmarks.target)]

View File

@ -462,6 +462,7 @@ type args =
mutable jsonparsecontentstreams : bool; mutable jsonparsecontentstreams : bool;
mutable jsonnostreamdata : bool; mutable jsonnostreamdata : bool;
mutable jsondecompressstreams : bool; mutable jsondecompressstreams : bool;
mutable jsoncleanstrings : bool;
mutable ocgrenamefrom : string; mutable ocgrenamefrom : string;
mutable ocgrenameto : string; mutable ocgrenameto : string;
mutable dedup : bool; mutable dedup : bool;
@ -581,6 +582,7 @@ let args =
jsonparsecontentstreams = false; jsonparsecontentstreams = false;
jsonnostreamdata = false; jsonnostreamdata = false;
jsondecompressstreams = false; jsondecompressstreams = false;
jsoncleanstrings = false;
ocgrenamefrom = ""; ocgrenamefrom = "";
ocgrenameto = ""; ocgrenameto = "";
dedup = false; dedup = false;
@ -685,6 +687,7 @@ let reset_arguments () =
args.jsonparsecontentstreams <- false; args.jsonparsecontentstreams <- false;
args.jsonnostreamdata <- false; args.jsonnostreamdata <- false;
args.jsondecompressstreams <- false; args.jsondecompressstreams <- false;
args.jsoncleanstrings <- false;
args.ocgrenamefrom <- ""; args.ocgrenamefrom <- "";
args.ocgrenameto <- ""; args.ocgrenameto <- "";
args.dedup <- false; args.dedup <- false;
@ -1614,6 +1617,9 @@ let setjsonnostreamdata () =
let setjsondecompressstreams () = let setjsondecompressstreams () =
args.jsondecompressstreams <- true args.jsondecompressstreams <- true
let setjsoncleanstrings () =
args.jsoncleanstrings <- true
let setocgrenamefrom s = let setocgrenamefrom s =
args.ocgrenamefrom <- s args.ocgrenamefrom <- s
@ -2426,6 +2432,9 @@ and specs =
("-output-json-decompress-streams", ("-output-json-decompress-streams",
Arg.Unit setjsondecompressstreams, Arg.Unit setjsondecompressstreams,
" Skip stream data for brevity"); " Skip stream data for brevity");
("-output-json-clean-strings",
Arg.Unit setjsoncleanstrings,
" Convert UTF16BE strings to PDFDocEncoding when possible");
("-j", ("-j",
Arg.String set_json_input, Arg.String set_json_input,
" Load a PDF JSON file"); " Load a PDF JSON file");
@ -2959,6 +2968,7 @@ let write_json output pdf =
~parse_content:args.jsonparsecontentstreams ~parse_content:args.jsonparsecontentstreams
~no_stream_data:args.jsonnostreamdata ~no_stream_data:args.jsonnostreamdata
~decompress_streams:args.jsondecompressstreams ~decompress_streams:args.jsondecompressstreams
~clean_strings:args.jsoncleanstrings
pdf pdf
| File filename -> | File filename ->
let f = open_out filename in let f = open_out filename in
@ -2967,6 +2977,7 @@ let write_json output pdf =
~parse_content:args.jsonparsecontentstreams ~parse_content:args.jsonparsecontentstreams
~no_stream_data:args.jsonnostreamdata ~no_stream_data:args.jsonnostreamdata
~decompress_streams:args.jsondecompressstreams ~decompress_streams:args.jsondecompressstreams
~clean_strings:args.jsoncleanstrings
pdf; pdf;
close_out f close_out f

View File

@ -23,11 +23,12 @@ Objects 1..n: The PDF's objects.
o Names are written as {"N": "/Pages"} o Names are written as {"N": "/Pages"}
o Indirect references are integers o Indirect references are integers
o Streams are {"S": [dict, data]} o Streams are {"S": [dict, data]}
o Strings are converted from UTF16BE/PDFDocEncoding to UTF8 before being
o Strings are converted from PDFDocEncoding to UTF8 before being encoded in encoded in JSON. When they are read back the process is JSON encoded --> UTF8
JSON. When they are read back the process is JSON encoded --> UTF8 --> --> UTF16BE/PDFDocEncoding. This process is fully reversible: it is to allow
PDFDocEncoding. This process is to allow easier editing of strings. This easier editing of strings. This does not happen to strings within text
does not happen to strings within text operators in parsed content streams. operators in parsed content streams, nor to /ID values in the
trailer dictionary, since neither is UTF16BE/PDFDocEncoding to begin with.
There are two subformats: parsing content streams or not. Hello World in CPDF There are two subformats: parsing content streams or not. Hello World in CPDF
JSON without parsing content streams: JSON without parsing content streams:
@ -81,7 +82,7 @@ When parsing content streams:
CPDF currently never preserves object streams, and only outputs unencrypted files. CPDF currently never preserves object streams, and only outputs unencrypted files.
When reloading a JSON file, CPDF knows how to correct /Length entries in When reloading a JSON file, CPDF knows how to correct or add /Length entries in
streams, so you need not worry about them. *) streams, so you need not worry about them. *)
open Pdfutil open Pdfutil
@ -277,12 +278,12 @@ let mkfloat f = `Assoc [("F", `Float f)]
let mkint i = `Assoc [("I", `Int i)] let mkint i = `Assoc [("I", `Int i)]
let mkname n = `Assoc [("N", `String n)] let mkname n = `Assoc [("N", `String n)]
let rec json_of_object pdf fcs no_stream_data pcs = function let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function
| P.Null -> `Null | P.Null -> `Null
| P.Boolean b -> `Bool b | P.Boolean b -> `Bool b
| P.Integer i -> mkint i | P.Integer i -> mkint i
| P.Real r -> mkfloat r | P.Real r -> mkfloat r
| P.String s -> `String s | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
| P.Name n -> mkname n | P.Name n -> mkname n
| P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs) | P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs)
| P.Dictionary elts -> | P.Dictionary elts ->
@ -453,9 +454,8 @@ let precombine_page_content pdf =
in in
Pdfpage.change_pages true pdf pages' Pdfpage.change_pages true pdf pages'
(* FIXME make this optional? And maybe move into actual JSON reader, instead of (* Convert any strings in UTF16BE which could actually be in PDFDocEncoding
preprocessing PDF, so it helps us when writing, say, the output of (due to having no high bytes) to make editing JSON easier. *)
-print-dict-entry? *)
let rec ppstring_single_object pdf = function let rec ppstring_single_object pdf = function
| Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d | Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d
| (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) -> | (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) ->
@ -464,16 +464,16 @@ let rec ppstring_single_object pdf = function
| Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s) | Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s)
| x -> x | x -> x
(* Do all objects, but skip the trailer dictionary, since processing it may mess
up /ID if it happens to begin with a UTF16BE BOM *)
let preprocess_strings pdf = let preprocess_strings pdf =
Pdf.objselfmap (ppstring_single_object pdf) pdf Pdf.objselfmap (ppstring_single_object pdf) pdf
(* Skip the trailer dictionary since may mess up /ID if it happens to begin with UTF16BE BOM *)
(*pdf.Pdf.trailerdict <- ppstring_single_object pdf pdf.Pdf.trailerdict*)
let json_of_pdf let json_of_pdf
~parse_content ~no_stream_data ~decompress_streams ~parse_content ~no_stream_data ~decompress_streams ~clean_strings
pdf pdf
= =
preprocess_strings pdf; if clean_strings then preprocess_strings pdf;
let pdf = if parse_content then precombine_page_content pdf else pdf in let pdf = if parse_content then precombine_page_content pdf else pdf in
if decompress_streams then if decompress_streams then
Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf; Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf;
@ -532,8 +532,8 @@ let json_of_pdf
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
pairs_parsed) pairs_parsed)
let to_output o ~parse_content ~no_stream_data ~decompress_streams pdf = let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams pdf in let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
match o.Pdfio.out_caml_channel with match o.Pdfio.out_caml_channel with
| Some ch -> J.pretty_to_channel ch json | Some ch -> J.pretty_to_channel ch json
| None -> o.Pdfio.output_string (J.pretty_to_string json) | None -> o.Pdfio.output_string (J.pretty_to_string json)

View File

@ -1,4 +1,4 @@
val json_of_object : Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject
val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> Pdf.t -> unit val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
val of_input : Pdfio.input -> Pdf.t val of_input : Pdfio.input -> Pdf.t

Binary file not shown.

View File

@ -1558,6 +1558,9 @@ There are two options which turn off parts of the squeezer. They are \texttt{-sq
\vspace{1.5mm} \vspace{1.5mm}
\small\noindent\verb!cpdf -add-bookmarks <bookmark file> in.pdf -o out.pdf! \small\noindent\verb!cpdf -add-bookmarks <bookmark file> in.pdf -o out.pdf!
\vspace{1.5mm}
\small\noindent\verb!cpdf -add-bookmarks-json <bookmark file> in.pdf -o out.pdf!
\vspace{1.5mm} \vspace{1.5mm}
\small\noindent\verb!cpdf -bookmarks-open-to-level <n> in.pdf -o out.pdf! \small\noindent\verb!cpdf -bookmarks-open-to-level <n> in.pdf -o out.pdf!
@ -1666,6 +1669,14 @@ will be given if the bookmarks file is not in the correct form (in particular,
the numbers in the first column which specify the level must form a proper the numbers in the first column which specify the level must form a proper
tree with no entry being more than one greater than the last). tree with no entry being more than one greater than the last).
Bookmarks in JSON format (see above) may be added with \texttt{-add-bookmarks-json}:
\begin{framed}
\small\verb!cpdf -add-bookmarks-json bookmarks.json in.pdf -o out.pdf!
\end{framed}
Remember that strings in JSON bookmark files are encoded in UTF8, not as native PDF strings.
\section{Opening bookmarks} \section{Opening bookmarks}
\index{bookmarks!opening at level} \index{bookmarks!opening at level}
As an alternative to extracting a bookmark file and manipulating the open-status of bookmarks, mass manipulation may be achieved by the following operation: As an alternative to extracting a bookmark file and manipulating the open-status of bookmarks, mass manipulation may be achieved by the following operation:

View File

@ -274,7 +274,7 @@ let print_dict_entry pdf key =
match Pdf.lookup_direct pdf key d with match Pdf.lookup_direct pdf key d with
| Some v -> | Some v ->
(* We use a double newline as a separator. *) (* We use a double newline as a separator. *)
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object pdf (fun _ -> ()) false false v)); Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v));
d d
| None -> d | None -> d
in in