Fixing up JSON text output
This commit is contained in:
parent
618da13a80
commit
dd2f8fd161
|
@ -26,7 +26,7 @@ let annotations_json_page pdf page pagenum =
|
||||||
| Some (Pdf.Array annots) ->
|
| Some (Pdf.Array annots) ->
|
||||||
map
|
map
|
||||||
(fun annot ->
|
(fun annot ->
|
||||||
`List [`Int pagenum; Cpdfjson.json_of_object pdf (fun _ -> ()) false false annot])
|
`List [`Int pagenum; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot])
|
||||||
(map (Pdf.direct pdf) annots)
|
(map (Pdf.direct pdf) annots)
|
||||||
| _ -> []
|
| _ -> []
|
||||||
|
|
||||||
|
|
|
@ -184,7 +184,7 @@ let output_json_marks ch calculate_page_number pdf fastrefnums marks =
|
||||||
let json_of_mark m =
|
let json_of_mark m =
|
||||||
`Assoc
|
`Assoc
|
||||||
[("level", `Int m.Pdfmarks.level);
|
[("level", `Int m.Pdfmarks.level);
|
||||||
("text", `String (Pdftext.utf8_of_pdfdocstring m.Pdfmarks.text));
|
("text", `String (Pdftext.utf8_of_pdfdocstring (Pdftext.simplify_utf16be m.Pdfmarks.text)));
|
||||||
("page", `Int (calculate_page_number m));
|
("page", `Int (calculate_page_number m));
|
||||||
("open", `Bool m.Pdfmarks.isopen);
|
("open", `Bool m.Pdfmarks.isopen);
|
||||||
("target", json_of_target pdf fastrefnums m.Pdfmarks.target)]
|
("target", json_of_target pdf fastrefnums m.Pdfmarks.target)]
|
||||||
|
|
|
@ -462,6 +462,7 @@ type args =
|
||||||
mutable jsonparsecontentstreams : bool;
|
mutable jsonparsecontentstreams : bool;
|
||||||
mutable jsonnostreamdata : bool;
|
mutable jsonnostreamdata : bool;
|
||||||
mutable jsondecompressstreams : bool;
|
mutable jsondecompressstreams : bool;
|
||||||
|
mutable jsoncleanstrings : bool;
|
||||||
mutable ocgrenamefrom : string;
|
mutable ocgrenamefrom : string;
|
||||||
mutable ocgrenameto : string;
|
mutable ocgrenameto : string;
|
||||||
mutable dedup : bool;
|
mutable dedup : bool;
|
||||||
|
@ -581,6 +582,7 @@ let args =
|
||||||
jsonparsecontentstreams = false;
|
jsonparsecontentstreams = false;
|
||||||
jsonnostreamdata = false;
|
jsonnostreamdata = false;
|
||||||
jsondecompressstreams = false;
|
jsondecompressstreams = false;
|
||||||
|
jsoncleanstrings = false;
|
||||||
ocgrenamefrom = "";
|
ocgrenamefrom = "";
|
||||||
ocgrenameto = "";
|
ocgrenameto = "";
|
||||||
dedup = false;
|
dedup = false;
|
||||||
|
@ -685,6 +687,7 @@ let reset_arguments () =
|
||||||
args.jsonparsecontentstreams <- false;
|
args.jsonparsecontentstreams <- false;
|
||||||
args.jsonnostreamdata <- false;
|
args.jsonnostreamdata <- false;
|
||||||
args.jsondecompressstreams <- false;
|
args.jsondecompressstreams <- false;
|
||||||
|
args.jsoncleanstrings <- false;
|
||||||
args.ocgrenamefrom <- "";
|
args.ocgrenamefrom <- "";
|
||||||
args.ocgrenameto <- "";
|
args.ocgrenameto <- "";
|
||||||
args.dedup <- false;
|
args.dedup <- false;
|
||||||
|
@ -1614,6 +1617,9 @@ let setjsonnostreamdata () =
|
||||||
let setjsondecompressstreams () =
|
let setjsondecompressstreams () =
|
||||||
args.jsondecompressstreams <- true
|
args.jsondecompressstreams <- true
|
||||||
|
|
||||||
|
let setjsoncleanstrings () =
|
||||||
|
args.jsoncleanstrings <- true
|
||||||
|
|
||||||
let setocgrenamefrom s =
|
let setocgrenamefrom s =
|
||||||
args.ocgrenamefrom <- s
|
args.ocgrenamefrom <- s
|
||||||
|
|
||||||
|
@ -2426,6 +2432,9 @@ and specs =
|
||||||
("-output-json-decompress-streams",
|
("-output-json-decompress-streams",
|
||||||
Arg.Unit setjsondecompressstreams,
|
Arg.Unit setjsondecompressstreams,
|
||||||
" Skip stream data for brevity");
|
" Skip stream data for brevity");
|
||||||
|
("-output-json-clean-strings",
|
||||||
|
Arg.Unit setjsoncleanstrings,
|
||||||
|
" Convert UTF16BE strings to PDFDocEncoding when possible");
|
||||||
("-j",
|
("-j",
|
||||||
Arg.String set_json_input,
|
Arg.String set_json_input,
|
||||||
" Load a PDF JSON file");
|
" Load a PDF JSON file");
|
||||||
|
@ -2959,6 +2968,7 @@ let write_json output pdf =
|
||||||
~parse_content:args.jsonparsecontentstreams
|
~parse_content:args.jsonparsecontentstreams
|
||||||
~no_stream_data:args.jsonnostreamdata
|
~no_stream_data:args.jsonnostreamdata
|
||||||
~decompress_streams:args.jsondecompressstreams
|
~decompress_streams:args.jsondecompressstreams
|
||||||
|
~clean_strings:args.jsoncleanstrings
|
||||||
pdf
|
pdf
|
||||||
| File filename ->
|
| File filename ->
|
||||||
let f = open_out filename in
|
let f = open_out filename in
|
||||||
|
@ -2967,6 +2977,7 @@ let write_json output pdf =
|
||||||
~parse_content:args.jsonparsecontentstreams
|
~parse_content:args.jsonparsecontentstreams
|
||||||
~no_stream_data:args.jsonnostreamdata
|
~no_stream_data:args.jsonnostreamdata
|
||||||
~decompress_streams:args.jsondecompressstreams
|
~decompress_streams:args.jsondecompressstreams
|
||||||
|
~clean_strings:args.jsoncleanstrings
|
||||||
pdf;
|
pdf;
|
||||||
close_out f
|
close_out f
|
||||||
|
|
||||||
|
|
34
cpdfjson.ml
34
cpdfjson.ml
|
@ -23,11 +23,12 @@ Objects 1..n: The PDF's objects.
|
||||||
o Names are written as {"N": "/Pages"}
|
o Names are written as {"N": "/Pages"}
|
||||||
o Indirect references are integers
|
o Indirect references are integers
|
||||||
o Streams are {"S": [dict, data]}
|
o Streams are {"S": [dict, data]}
|
||||||
|
o Strings are converted from UTF16BE/PDFDocEncoding to UTF8 before being
|
||||||
o Strings are converted from PDFDocEncoding to UTF8 before being encoded in
|
encoded in JSON. When they are read back the process is JSON encoded --> UTF8
|
||||||
JSON. When they are read back the process is JSON encoded --> UTF8 -->
|
--> UTF16BE/PDFDocEncoding. This process is fully reversible: it is to allow
|
||||||
PDFDocEncoding. This process is to allow easier editing of strings. This
|
easier editing of strings. This does not happen to strings within text
|
||||||
does not happen to strings within text operators in parsed content streams.
|
operators in parsed content streams, nor to /ID values in the
|
||||||
|
trailer dictionary, since neither is UTF16BE/PDFDocEncoding to begin with.
|
||||||
|
|
||||||
There are two subformats: parsing content streams or not. Hello World in CPDF
|
There are two subformats: parsing content streams or not. Hello World in CPDF
|
||||||
JSON without parsing content streams:
|
JSON without parsing content streams:
|
||||||
|
@ -81,7 +82,7 @@ When parsing content streams:
|
||||||
|
|
||||||
CPDF currently never preserves object streams, and only outputs unencrypted files.
|
CPDF currently never preserves object streams, and only outputs unencrypted files.
|
||||||
|
|
||||||
When reloading a JSON file, CPDF knows how to correct /Length entries in
|
When reloading a JSON file, CPDF knows how to correct or add /Length entries in
|
||||||
streams, so you need not worry about them. *)
|
streams, so you need not worry about them. *)
|
||||||
|
|
||||||
open Pdfutil
|
open Pdfutil
|
||||||
|
@ -277,12 +278,12 @@ let mkfloat f = `Assoc [("F", `Float f)]
|
||||||
let mkint i = `Assoc [("I", `Int i)]
|
let mkint i = `Assoc [("I", `Int i)]
|
||||||
let mkname n = `Assoc [("N", `String n)]
|
let mkname n = `Assoc [("N", `String n)]
|
||||||
|
|
||||||
let rec json_of_object pdf fcs no_stream_data pcs = function
|
let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function
|
||||||
| P.Null -> `Null
|
| P.Null -> `Null
|
||||||
| P.Boolean b -> `Bool b
|
| P.Boolean b -> `Bool b
|
||||||
| P.Integer i -> mkint i
|
| P.Integer i -> mkint i
|
||||||
| P.Real r -> mkfloat r
|
| P.Real r -> mkfloat r
|
||||||
| P.String s -> `String s
|
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
|
||||||
| P.Name n -> mkname n
|
| P.Name n -> mkname n
|
||||||
| P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs)
|
| P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs)
|
||||||
| P.Dictionary elts ->
|
| P.Dictionary elts ->
|
||||||
|
@ -453,9 +454,8 @@ let precombine_page_content pdf =
|
||||||
in
|
in
|
||||||
Pdfpage.change_pages true pdf pages'
|
Pdfpage.change_pages true pdf pages'
|
||||||
|
|
||||||
(* FIXME make this optional? And maybe move into actual JSON reader, instead of
|
(* Convert any strings in UTF16BE which could actually be in PDFDocEncoding
|
||||||
preprocessing PDF, so it helps us when writing, say, the output of
|
(due to having no high bytes) to make editing JSON easier. *)
|
||||||
-print-dict-entry? *)
|
|
||||||
let rec ppstring_single_object pdf = function
|
let rec ppstring_single_object pdf = function
|
||||||
| Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d
|
| Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d
|
||||||
| (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) ->
|
| (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) ->
|
||||||
|
@ -464,16 +464,16 @@ let rec ppstring_single_object pdf = function
|
||||||
| Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s)
|
| Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s)
|
||||||
| x -> x
|
| x -> x
|
||||||
|
|
||||||
|
(* Do all objects, but skip the trailer dictionary since that may mess up /ID if it
|
||||||
|
happens to begin with UTF16BE BOM *)
|
||||||
let preprocess_strings pdf =
|
let preprocess_strings pdf =
|
||||||
Pdf.objselfmap (ppstring_single_object pdf) pdf
|
Pdf.objselfmap (ppstring_single_object pdf) pdf
|
||||||
(* Skip the trailer dictionary since may mess up /ID if it happens to begin with UTF16BE BOM *)
|
|
||||||
(*pdf.Pdf.trailerdict <- ppstring_single_object pdf pdf.Pdf.trailerdict*)
|
|
||||||
|
|
||||||
let json_of_pdf
|
let json_of_pdf
|
||||||
~parse_content ~no_stream_data ~decompress_streams
|
~parse_content ~no_stream_data ~decompress_streams ~clean_strings
|
||||||
pdf
|
pdf
|
||||||
=
|
=
|
||||||
preprocess_strings pdf;
|
if clean_strings then preprocess_strings pdf;
|
||||||
let pdf = if parse_content then precombine_page_content pdf else pdf in
|
let pdf = if parse_content then precombine_page_content pdf else pdf in
|
||||||
if decompress_streams then
|
if decompress_streams then
|
||||||
Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf;
|
Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf;
|
||||||
|
@ -532,8 +532,8 @@ let json_of_pdf
|
||||||
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
|
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
|
||||||
pairs_parsed)
|
pairs_parsed)
|
||||||
|
|
||||||
let to_output o ~parse_content ~no_stream_data ~decompress_streams pdf =
|
let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
|
||||||
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams pdf in
|
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
|
||||||
match o.Pdfio.out_caml_channel with
|
match o.Pdfio.out_caml_channel with
|
||||||
| Some ch -> J.pretty_to_channel ch json
|
| Some ch -> J.pretty_to_channel ch json
|
||||||
| None -> o.Pdfio.output_string (J.pretty_to_string json)
|
| None -> o.Pdfio.output_string (J.pretty_to_string json)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
val json_of_object : Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
|
val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
|
||||||
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject
|
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject
|
||||||
val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> Pdf.t -> unit
|
val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
|
||||||
val of_input : Pdfio.input -> Pdf.t
|
val of_input : Pdfio.input -> Pdf.t
|
||||||
|
|
BIN
cpdfmanual.pdf
BIN
cpdfmanual.pdf
Binary file not shown.
|
@ -1558,6 +1558,9 @@ There are two options which turn off parts of the squeezer. They are \texttt{-sq
|
||||||
\vspace{1.5mm}
|
\vspace{1.5mm}
|
||||||
\small\noindent\verb!cpdf -add-bookmarks <bookmark file> in.pdf -o out.pdf!
|
\small\noindent\verb!cpdf -add-bookmarks <bookmark file> in.pdf -o out.pdf!
|
||||||
|
|
||||||
|
\vspace{1.5mm}
|
||||||
|
\small\noindent\verb!cpdf -add-bookmarks-json <bookmark file> in.pdf -o out.pdf!
|
||||||
|
|
||||||
\vspace{1.5mm}
|
\vspace{1.5mm}
|
||||||
\small\noindent\verb!cpdf -bookmarks-open-to-level <n> in.pdf -o out.pdf!
|
\small\noindent\verb!cpdf -bookmarks-open-to-level <n> in.pdf -o out.pdf!
|
||||||
|
|
||||||
|
@ -1666,6 +1669,14 @@ will be given if the bookmarks file is not in the correct form (in particular,
|
||||||
the numbers in the first column which specify the level must form a proper
|
the numbers in the first column which specify the level must form a proper
|
||||||
tree with no entry being more than one greater than the last).
|
tree with no entry being more than one greater than the last).
|
||||||
|
|
||||||
|
Bookmarks in JSON format (see above) may be added with \texttt{-add-bookmarks-json}:
|
||||||
|
|
||||||
|
\begin{framed}
|
||||||
|
\small\verb!cpdf -add-bookmarks-json bookmarks.json in.pdf -o out.pdf!
|
||||||
|
\end{framed}
|
||||||
|
|
||||||
|
Remember that strings in JSON bookmark files are in UTF8, rather than being native PDF strings.
|
||||||
|
|
||||||
\section{Opening bookmarks}
|
\section{Opening bookmarks}
|
||||||
\index{bookmarks!opening at level}
|
\index{bookmarks!opening at level}
|
||||||
As an alternative to extracting a bookmark file and manipulating the open-status of bookmarks, mass manipulation may be achieved by the following operation:
|
As an alternative to extracting a bookmark file and manipulating the open-status of bookmarks, mass manipulation may be achieved by the following operation:
|
||||||
|
|
|
@ -274,7 +274,7 @@ let print_dict_entry pdf key =
|
||||||
match Pdf.lookup_direct pdf key d with
|
match Pdf.lookup_direct pdf key d with
|
||||||
| Some v ->
|
| Some v ->
|
||||||
(* We use a double newline as a separator. *)
|
(* We use a double newline as a separator. *)
|
||||||
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object pdf (fun _ -> ()) false false v));
|
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v));
|
||||||
d
|
d
|
||||||
| None -> d
|
| None -> d
|
||||||
in
|
in
|
||||||
|
|
Loading…
Reference in New Issue