diff --git a/cpdfannot.ml b/cpdfannot.ml index d954756..15b73e2 100644 --- a/cpdfannot.ml +++ b/cpdfannot.ml @@ -74,7 +74,7 @@ let annotations_json_page pdf page pagenum = let annot = Pdf.direct pdf annot in let annot = rewrite_destinations pdf annot in extra := annot::!extra; - `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot]) + `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot]) annots | _ -> [] @@ -127,7 +127,7 @@ let postprocess_json pdf objnum_to_serial_map json = | `List [`Int pagenum; `Int serial; jo] -> let pdfobj = Cpdfjson.object_of_json jo in let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in - `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false fixed] + `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed] | _ -> assert false) json @@ -149,13 +149,13 @@ let list_annotations_json range pdf = let extra = map (fun n -> - `List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false (Pdf.lookup_obj pdf n)]) + `List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)]) (setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra))) in let header = `List [`Int 0; - Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false + Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])] in let json = `List ([header] @ json @ extra) in diff --git a/cpdfbookmarks.ml b/cpdfbookmarks.ml index f0207a6..ed24b1a 100644 --- a/cpdfbookmarks.ml +++ b/cpdfbookmarks.ml @@ -176,8 +176,8 @@ let json_of_target pdf 
fastrefnums x = let a = Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more) in - Cpdfjson.json_of_object pdf (fun _ -> ()) false false a - | x -> Cpdfjson.json_of_object pdf (fun _ -> ()) false false x + Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false a + | x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false x let output_json_marks output calculate_page_number pdf fastrefnums marks = let module J = Cpdfyojson.Safe in diff --git a/cpdfjson.ml b/cpdfjson.ml index 5cdf688..2dd0954 100644 --- a/cpdfjson.ml +++ b/cpdfjson.ml @@ -1,17 +1,22 @@ (* Read and write PDF files in JSON format. +Format version 3: adds UTF8 option + The file is an array of arrays containing an object number followed by an object, one for each object in the file and two special ones: Object -1: CPDF's own data with the PDF version number, CPDF JSON format number, and flags used when writing (which may be required when reading): - o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2) + o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 3) o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed) o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot round-trip if false). o /CPDFJSONmajorpdfversion (CPDFJSON integer) o /CPDFJSONminorpdfversion (CPDFJSON integer) + o /CPDFJSONisUTF8 (Optional. Format 3. If true, strings are converted to UTF8 + before conversion to JSON, and converted back to PDFDocEncoding/UTF16BE during + conversion to PDF.)
Object 0: The PDF's trailer dictionary @@ -273,14 +278,14 @@ let mkfloat f = `Assoc [("F", `Float f)] let mkint i = `Assoc [("I", `Int i)] let mkname n = `Assoc [("N", `String n)] -let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function +let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function | P.Null -> `Null | P.Boolean b -> `Bool b | P.Integer i -> mkint i | P.Real r -> mkfloat r | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s) | P.Name n -> mkname n - | P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs) + | P.Array objs -> `List (map (json_of_object pdf fcs ~no_stream_data ~parse_content) objs) | P.Dictionary elts -> iter (function @@ -292,11 +297,11 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct | ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts | _ -> ()) elts; - `Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data pcs v)) elts) + `Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs ~no_stream_data ~parse_content v)) elts) | P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream -> P.getstream thestream; let str, dict' = - match P.lookup_direct pdf "/FunctionType" d, pcs with + match P.lookup_direct pdf "/FunctionType" d, parse_content with | Some _, true -> Pdfcodec.decode_pdfstream_until_unknown pdf thestream; let dict = P.remove_dict_entry d "/Filter" in @@ -305,7 +310,7 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct if no_stream_data then ("<>", d) else match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget" in - json_of_object pdf fcs no_stream_data pcs (P.Dictionary [("S", P.Array [dict'; P.String str])]) + json_of_object pdf fcs ~no_stream_data ~parse_content (P.Dictionary [("S", P.Array [dict'; P.String str])]) | P.Stream _ -> error 
"error: stream with not-a-dictionary" | P.Indirect i -> begin match P.lookup_obj pdf i with @@ -350,7 +355,7 @@ let json_of_op pdf no_stream_data = function `List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"] | O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"] | O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"] - | O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "BDC"] + | O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "BDC"] | O.Op_gs s -> `List [`String s; `String "gs"] | O.Op_Do s -> `List [`String s; `String "Do"] | O.Op_CS s -> `List [`String s; `String "CS"] @@ -395,7 +400,7 @@ let json_of_op pdf no_stream_data = function mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f; `String "Tm"] | O.Op_Tj s -> `List [`String s; `String "Tj"] - | O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) no_stream_data false pdfobject; `String "TJ"] + | O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false pdfobject; `String "TJ"] | O.Op_' s -> `List [`String s; `String "'"] | O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"] | O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"] @@ -421,9 +426,9 @@ let json_of_op pdf no_stream_data = function | O.Op_scnName (s, fs) -> `List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"]) | O.InlineImage (dict, data) -> - `List [json_of_object pdf (fun _ -> ()) no_stream_data false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"] + `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"] | O.Op_DP (s, obj) -> - `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "DP"] + `List [`String s; json_of_object pdf (fun _ -> 
()) ~no_stream_data ~parse_content:false obj; `String "DP"] (* parse_stream needs pdf and resources. These are for lexing of inline images, * looking up the colourspace. *) @@ -475,9 +480,9 @@ let json_of_pdf (fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ()) pdf; Pdf.remove_unreferenced pdf; - let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data false pdf.P.trailerdict) in + let trailerdict = (0, json_of_object pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in let parameters = - (-1, json_of_object pdf (fun x -> ()) false false + (-1, json_of_object pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2); ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content); ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data)); @@ -494,7 +499,7 @@ let json_of_pdf let ps = ref [] in P.objiter (fun i pdfobj -> - ps := (i, json_of_object pdf fcs no_stream_data parse_content pdfobj)::!ps) + ps := (i, json_of_object pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps) pdf; parameters::trailerdict::sort compare !ps in @@ -529,7 +534,7 @@ let json_of_pdf (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) pairs_parsed) -let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf = +let to_output o ?(utf8=false) ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf = let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in match o.Pdfio.out_caml_channel with | Some ch -> J.pretty_to_channel ch json diff --git a/cpdfjson.mli b/cpdfjson.mli index c77211f..aed4d9f 100644 --- a/cpdfjson.mli +++ b/cpdfjson.mli @@ -4,7 +4,7 @@ streams, [no_stream_data] will omit stream data, [decompress_streams] decompresses all streams, [clean_strings] will convert any UTF16BE strings to PDFDocEncoding if it can. 
*) -val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit +val to_output : Pdfio.output -> ?utf8:bool -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit (** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *) val of_input : Pdfio.input -> Pdf.t @@ -13,7 +13,7 @@ val of_input : Pdfio.input -> Pdf.t Then the PDF file, then a function which is usually [function _ -> ()], then [no_stream_data] as above, then [parse_content_streams] as above, and finally the object itself. *) -val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t +val json_of_object : ?utf8:bool -> ?clean_strings:bool -> Pdf.t -> (int -> unit) -> no_stream_data:bool -> parse_content:bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t (** Convert a single CPDFJSON object to a PDF object *) val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject diff --git a/cpdftweak.ml b/cpdftweak.ml index 494b3ad..d8c603b 100644 --- a/cpdftweak.ml +++ b/cpdftweak.ml @@ -269,7 +269,7 @@ let print_dict_entry pdf key = match Pdf.lookup_direct pdf key d with | Some v -> (* We use a double newline as a separator. 
*) - Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v)); + Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v)); d | None -> d in @@ -281,7 +281,7 @@ let get_dict_entries pdf key = let es = ref [] in let f d = match Pdf.lookup_direct pdf key d with - | Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v::!es; d + | Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d | None -> d in Pdf.objselfmap (dict_entry_single_object f pdf) pdf;