diff --git a/cpdfannot.ml b/cpdfannot.ml index 1510395..e44a359 100644 --- a/cpdfannot.ml +++ b/cpdfannot.ml @@ -26,7 +26,7 @@ let annotations_json_page pdf page pagenum = | Some (Pdf.Array annots) -> map (fun annot -> - `List [`Int pagenum; Cpdfjson.json_of_object pdf (fun _ -> ()) false false annot]) + `List [`Int pagenum; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot]) (map (Pdf.direct pdf) annots) | _ -> [] diff --git a/cpdfbookmarks.ml b/cpdfbookmarks.ml index 1121586..04198bb 100644 --- a/cpdfbookmarks.ml +++ b/cpdfbookmarks.ml @@ -184,7 +184,7 @@ let output_json_marks ch calculate_page_number pdf fastrefnums marks = let json_of_mark m = `Assoc [("level", `Int m.Pdfmarks.level); - ("text", `String (Pdftext.utf8_of_pdfdocstring m.Pdfmarks.text)); + ("text", `String (Pdftext.utf8_of_pdfdocstring (Pdftext.simplify_utf16be m.Pdfmarks.text))); ("page", `Int (calculate_page_number m)); ("open", `Bool m.Pdfmarks.isopen); ("target", json_of_target pdf fastrefnums m.Pdfmarks.target)] diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 639ecac..3f0fc98 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -462,6 +462,7 @@ type args = mutable jsonparsecontentstreams : bool; mutable jsonnostreamdata : bool; mutable jsondecompressstreams : bool; + mutable jsoncleanstrings : bool; mutable ocgrenamefrom : string; mutable ocgrenameto : string; mutable dedup : bool; @@ -581,6 +582,7 @@ let args = jsonparsecontentstreams = false; jsonnostreamdata = false; jsondecompressstreams = false; + jsoncleanstrings = false; ocgrenamefrom = ""; ocgrenameto = ""; dedup = false; @@ -685,6 +687,7 @@ let reset_arguments () = args.jsonparsecontentstreams <- false; args.jsonnostreamdata <- false; args.jsondecompressstreams <- false; + args.jsoncleanstrings <- false; args.ocgrenamefrom <- ""; args.ocgrenameto <- ""; args.dedup <- false; @@ -1614,6 +1617,9 @@ let setjsonnostreamdata () = let setjsondecompressstreams () = args.jsondecompressstreams <- true +let 
setjsoncleanstrings () = + args.jsoncleanstrings <- true + let setocgrenamefrom s = args.ocgrenamefrom <- s @@ -2426,6 +2432,9 @@ and specs = ("-output-json-decompress-streams", Arg.Unit setjsondecompressstreams, " Skip stream data for brevity"); + ("-output-json-clean-strings", + Arg.Unit setjsoncleanstrings, + " Convert UTF16BE strings to PDFDocEncoding when possible"); ("-j", Arg.String set_json_input, " Load a PDF JSON file"); @@ -2959,6 +2968,7 @@ let write_json output pdf = ~parse_content:args.jsonparsecontentstreams ~no_stream_data:args.jsonnostreamdata ~decompress_streams:args.jsondecompressstreams + ~clean_strings:args.jsoncleanstrings pdf | File filename -> let f = open_out filename in @@ -2967,6 +2977,7 @@ let write_json output pdf = ~parse_content:args.jsonparsecontentstreams ~no_stream_data:args.jsonnostreamdata ~decompress_streams:args.jsondecompressstreams + ~clean_strings:args.jsoncleanstrings pdf; close_out f diff --git a/cpdfjson.ml b/cpdfjson.ml index 3162c32..a9e2e21 100644 --- a/cpdfjson.ml +++ b/cpdfjson.ml @@ -23,11 +23,12 @@ Objects 1..n: The PDF's objects. o Names are written as {"N": "/Pages"} o Indirect references are integers o Streams are {"S": [dict, data]} - - o Strings are converted from PDFDocEncoding to UTF8 before being encoded in - JSON. When they are read back the process is JSON encoded --> UTF8 --> - PDFDocEncoding. This process is to allow easier editing of strings. This - does not happen to strings within text operators in parsed content streams. + o Strings are converted from UTF16BE/PDFDocEncoding to UTF8 before being + encoded in JSON. When they are read back the process is JSON encoded --> UTF8 + --> UTF16BE/PDFDocEncoding. This process is fully reversible: it is to allow + easier editing of strings. This does not happen to strings within text + operators in parsed content streams, nor to /ID values in the + trailer dictionary, since neither is UTF16BE/PDFDocEncoding to begin with. 
There are two subformats: parsing content streams or not. Hello World in CPDF JSON without parsing content streams: @@ -81,7 +82,7 @@ When parsing content streams: CPDF currently never preserves object streams, and only outputs unencrypted files. -When reloading a JSON file, CPDF knows how to correct /Length entries in +When reloading a JSON file, CPDF knows how to correct or add /Length entries in streams, so you need not worry about them. *) open Pdfutil @@ -277,12 +278,12 @@ let mkfloat f = `Assoc [("F", `Float f)] let mkint i = `Assoc [("I", `Int i)] let mkname n = `Assoc [("N", `String n)] -let rec json_of_object pdf fcs no_stream_data pcs = function +let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function | P.Null -> `Null | P.Boolean b -> `Bool b | P.Integer i -> mkint i | P.Real r -> mkfloat r - | P.String s -> `String s + | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s) | P.Name n -> mkname n | P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs) | P.Dictionary elts -> @@ -453,9 +454,8 @@ let precombine_page_content pdf = in Pdfpage.change_pages true pdf pages' -(* FIXME make this optional? And maybe move into actual JSON reader, instead of - preprocessing PDF, so it helps us when writing, say, the output of - -print-dict-entry? *) +(* Convert any strings in UTF16BE which could actually be in PDFDocEncoding + (due to having no high bytes) to make editing JSON easier. 
*) let rec ppstring_single_object pdf = function | Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d | (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) -> @@ -464,16 +464,16 @@ let rec ppstring_single_object pdf = function | Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s) | x -> x +(* Do all objects, but skip the trailer dictionary, since processing it may + mess up /ID if it happens to begin with a UTF16BE BOM *) let preprocess_strings pdf = Pdf.objselfmap (ppstring_single_object pdf) pdf - (* Skip the trailer dictionary since may mess up /ID if it happens to begin with UTF16BE BOM *) - (*pdf.Pdf.trailerdict <- ppstring_single_object pdf pdf.Pdf.trailerdict*) let json_of_pdf - ~parse_content ~no_stream_data ~decompress_streams + ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf = - preprocess_strings pdf; + if clean_strings then preprocess_strings pdf; let pdf = if parse_content then precombine_page_content pdf else pdf in if decompress_streams then Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf; @@ -532,8 +532,8 @@ let json_of_pdf (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) pairs_parsed) -let to_output o ~parse_content ~no_stream_data ~decompress_streams pdf = - let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams pdf in +let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf = + let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in match o.Pdfio.out_caml_channel with | Some ch -> J.pretty_to_channel ch json | None -> o.Pdfio.output_string (J.pretty_to_string json) diff --git a/cpdfjson.mli b/cpdfjson.mli index 36f7724..0675c6a 100644 --- a/cpdfjson.mli +++ b/cpdfjson.mli @@ -1,4 +1,4 @@ -val json_of_object : Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t +val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> 
Cpdfyojson.Safe.t val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject -val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> Pdf.t -> unit +val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit val of_input : Pdfio.input -> Pdf.t diff --git a/cpdfmanual.pdf b/cpdfmanual.pdf index 5db3b2f..d6d36a1 100644 Binary files a/cpdfmanual.pdf and b/cpdfmanual.pdf differ diff --git a/cpdfmanual.tex b/cpdfmanual.tex index fc4f123..ce4385b 100644 --- a/cpdfmanual.tex +++ b/cpdfmanual.tex @@ -1558,6 +1558,9 @@ There are two options which turn off parts of the squeezer. They are \texttt{-sq \vspace{1.5mm} \small\noindent\verb!cpdf -add-bookmarks in.pdf -o out.pdf! + \vspace{1.5mm} + \small\noindent\verb!cpdf -add-bookmarks-json in.pdf -o out.pdf! + \vspace{1.5mm} \small\noindent\verb!cpdf -bookmarks-open-to-level in.pdf -o out.pdf! @@ -1666,6 +1669,14 @@ will be given if the bookmarks file is not in the correct form (in particular, the numbers in the first column which specify the level must form a proper tree with no entry being more than one greater than the last). +Bookmarks in JSON format (see above) may be added with \texttt{-add-bookmarks-json}: + + \begin{framed} + \small\verb!cpdf -add-bookmarks-json bookmarks.json in.pdf -o out.pdf! + \end{framed} + +Remember that strings in JSON bookmark files are in UTF8, rather than as native PDF strings. + \section{Opening bookmarks} \index{bookmarks!opening at level} As an alternative to extracting a bookmark file and manipulating the open-status of bookmarks, mass manipulation may be achieved by the following operation: diff --git a/cpdftweak.ml b/cpdftweak.ml index 9187e86..89f64be 100644 --- a/cpdftweak.ml +++ b/cpdftweak.ml @@ -274,7 +274,7 @@ let print_dict_entry pdf key = match Pdf.lookup_direct pdf key d with | Some v -> (* We use a double newline as a separator. 
*) - Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object pdf (fun _ -> ()) false false v)); + Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v)); d | None -> d in