This commit is contained in:
John Whitington 2023-01-16 14:29:54 +08:00
parent 7a7f3eba29
commit 7a8a1267e4
5 changed files with 29 additions and 24 deletions

View File

@ -74,7 +74,7 @@ let annotations_json_page pdf page pagenum =
let annot = Pdf.direct pdf annot in let annot = Pdf.direct pdf annot in
let annot = rewrite_destinations pdf annot in let annot = rewrite_destinations pdf annot in
extra := annot::!extra; extra := annot::!extra;
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot]) `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot])
annots annots
| _ -> [] | _ -> []
@ -127,7 +127,7 @@ let postprocess_json pdf objnum_to_serial_map json =
| `List [`Int pagenum; `Int serial; jo] -> | `List [`Int pagenum; `Int serial; jo] ->
let pdfobj = Cpdfjson.object_of_json jo in let pdfobj = Cpdfjson.object_of_json jo in
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false fixed] `List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed]
| _ -> assert false) | _ -> assert false)
json json
@ -149,13 +149,13 @@ let list_annotations_json range pdf =
let extra = let extra =
map map
(fun n -> (fun n ->
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false (Pdf.lookup_obj pdf n)]) `List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
(setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra))) (setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra)))
in in
let header = let header =
`List `List
[`Int 0; [`Int 0;
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])] (Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
in in
let json = `List ([header] @ json @ extra) in let json = `List ([header] @ json @ extra) in

View File

@ -176,8 +176,8 @@ let json_of_target pdf fastrefnums x =
let a = let a =
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more) Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
in in
Cpdfjson.json_of_object pdf (fun _ -> ()) false false a Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false a
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) false false x | x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false x
let output_json_marks output calculate_page_number pdf fastrefnums marks = let output_json_marks output calculate_page_number pdf fastrefnums marks =
let module J = Cpdfyojson.Safe in let module J = Cpdfyojson.Safe in

View File

@ -1,17 +1,22 @@
(* Read and write PDF files in JSON format. (* Read and write PDF files in JSON format.
Format version 3: adds UTF8 option
The file is an array of arrays containing an object number followed by an The file is an array of arrays containing an object number followed by an
object, one for each object in the file and two special ones: object, one for each object in the file and two special ones:
Object -1: CPDF's own data with the PDF version number, CPDF JSON format Object -1: CPDF's own data with the PDF version number, CPDF JSON format
number, and flags used when writing (which may be required when reading): number, and flags used when writing (which may be required when reading):
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2) o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 3)
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed) o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
round-trip if false). round-trip if false).
o /CPDFJSONmajorpdfversion (CPDFJSON integer) o /CPDFJSONmajorpdfversion (CPDFJSON integer)
o /CPDFJSONminorpdfversion (CPDFJSON integer) o /CPDFJSONminorpdfversion (CPDFJSON integer)
o /CPDFJSONisUTF8 (Optional. Format 3. If true, strings are converted to UTF8
before conversion to JSON, and converted back to PDFDocEncoding/UTF16BE during
conversion to PDF.)
Object 0: The PDF's trailer dictionary Object 0: The PDF's trailer dictionary
@ -273,14 +278,14 @@ let mkfloat f = `Assoc [("F", `Float f)]
let mkint i = `Assoc [("I", `Int i)] let mkint i = `Assoc [("I", `Int i)]
let mkname n = `Assoc [("N", `String n)] let mkname n = `Assoc [("N", `String n)]
let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
| P.Null -> `Null | P.Null -> `Null
| P.Boolean b -> `Bool b | P.Boolean b -> `Bool b
| P.Integer i -> mkint i | P.Integer i -> mkint i
| P.Real r -> mkfloat r | P.Real r -> mkfloat r
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s) | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
| P.Name n -> mkname n | P.Name n -> mkname n
| P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs) | P.Array objs -> `List (map (json_of_object pdf fcs ~no_stream_data ~parse_content) objs)
| P.Dictionary elts -> | P.Dictionary elts ->
iter iter
(function (function
@ -292,11 +297,11 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
| ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts | ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
| _ -> ()) | _ -> ())
elts; elts;
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data pcs v)) elts) `Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs ~no_stream_data ~parse_content v)) elts)
| P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream -> | P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream ->
P.getstream thestream; P.getstream thestream;
let str, dict' = let str, dict' =
match P.lookup_direct pdf "/FunctionType" d, pcs with match P.lookup_direct pdf "/FunctionType" d, parse_content with
| Some _, true -> | Some _, true ->
Pdfcodec.decode_pdfstream_until_unknown pdf thestream; Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
let dict = P.remove_dict_entry d "/Filter" in let dict = P.remove_dict_entry d "/Filter" in
@ -305,7 +310,7 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
if no_stream_data then ("<<stream data elided>>", d) else if no_stream_data then ("<<stream data elided>>", d) else
match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget" match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget"
in in
json_of_object pdf fcs no_stream_data pcs (P.Dictionary [("S", P.Array [dict'; P.String str])]) json_of_object pdf fcs ~no_stream_data ~parse_content (P.Dictionary [("S", P.Array [dict'; P.String str])])
| P.Stream _ -> error "error: stream with not-a-dictionary" | P.Stream _ -> error "error: stream with not-a-dictionary"
| P.Indirect i -> | P.Indirect i ->
begin match P.lookup_obj pdf i with begin match P.lookup_obj pdf i with
@ -350,7 +355,7 @@ let json_of_op pdf no_stream_data = function
`List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"] `List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"]
| O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"] | O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"]
| O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"] | O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"]
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "BDC"] | O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "BDC"]
| O.Op_gs s -> `List [`String s; `String "gs"] | O.Op_gs s -> `List [`String s; `String "gs"]
| O.Op_Do s -> `List [`String s; `String "Do"] | O.Op_Do s -> `List [`String s; `String "Do"]
| O.Op_CS s -> `List [`String s; `String "CS"] | O.Op_CS s -> `List [`String s; `String "CS"]
@ -395,7 +400,7 @@ let json_of_op pdf no_stream_data = function
mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f; mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f;
`String "Tm"] `String "Tm"]
| O.Op_Tj s -> `List [`String s; `String "Tj"] | O.Op_Tj s -> `List [`String s; `String "Tj"]
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) no_stream_data false pdfobject; `String "TJ"] | O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false pdfobject; `String "TJ"]
| O.Op_' s -> `List [`String s; `String "'"] | O.Op_' s -> `List [`String s; `String "'"]
| O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"] | O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"]
| O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"] | O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"]
@ -421,9 +426,9 @@ let json_of_op pdf no_stream_data = function
| O.Op_scnName (s, fs) -> | O.Op_scnName (s, fs) ->
`List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"]) `List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"])
| O.InlineImage (dict, data) -> | O.InlineImage (dict, data) ->
`List [json_of_object pdf (fun _ -> ()) no_stream_data false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"] `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
| O.Op_DP (s, obj) -> | O.Op_DP (s, obj) ->
`List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "DP"] `List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "DP"]
(* parse_stream needs pdf and resources. These are for lexing of inline images, (* parse_stream needs pdf and resources. These are for lexing of inline images,
* looking up the colourspace. *) * looking up the colourspace. *)
@ -475,9 +480,9 @@ let json_of_pdf
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ()) (fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
pdf; pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data false pdf.P.trailerdict) in let trailerdict = (0, json_of_object pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
let parameters = let parameters =
(-1, json_of_object pdf (fun x -> ()) false false (-1, json_of_object pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2); (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
("/CPDFJSONcontentparsed", Pdf.Boolean parse_content); ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data)); ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
@ -494,7 +499,7 @@ let json_of_pdf
let ps = ref [] in let ps = ref [] in
P.objiter P.objiter
(fun i pdfobj -> (fun i pdfobj ->
ps := (i, json_of_object pdf fcs no_stream_data parse_content pdfobj)::!ps) ps := (i, json_of_object pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps)
pdf; pdf;
parameters::trailerdict::sort compare !ps parameters::trailerdict::sort compare !ps
in in
@ -529,7 +534,7 @@ let json_of_pdf
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
pairs_parsed) pairs_parsed)
let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf = let to_output o ?(utf8=false) ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
match o.Pdfio.out_caml_channel with match o.Pdfio.out_caml_channel with
| Some ch -> J.pretty_to_channel ch json | Some ch -> J.pretty_to_channel ch json

View File

@ -4,7 +4,7 @@
streams, [no_stream_data] will omit stream data, [decompress_streams] streams, [no_stream_data] will omit stream data, [decompress_streams]
decompresses all streams, [clean_strings] will convert any UTF16BE strings decompresses all streams, [clean_strings] will convert any UTF16BE strings
to PDFDocEncoding if it can. *) to PDFDocEncoding if it can. *)
val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit val to_output : Pdfio.output -> ?utf8:bool -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
(** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *) (** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *)
val of_input : Pdfio.input -> Pdf.t val of_input : Pdfio.input -> Pdf.t
@ -13,7 +13,7 @@ val of_input : Pdfio.input -> Pdf.t
Then the PDF file, then a function which is usually [function _ -> ()], then Then the PDF file, then a function which is usually [function _ -> ()], then
[no_stream_data] as above, then [parse_content_streams] as above, and [no_stream_data] as above, then [parse_content_streams] as above, and
finally the object itself. *) finally the object itself. *)
val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t val json_of_object : ?utf8:bool -> ?clean_strings:bool -> Pdf.t -> (int -> unit) -> no_stream_data:bool -> parse_content:bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
(** Convert a single CPDFJSON object to a PDF object *) (** Convert a single CPDFJSON object to a PDF object *)
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject

View File

@ -269,7 +269,7 @@ let print_dict_entry pdf key =
match Pdf.lookup_direct pdf key d with match Pdf.lookup_direct pdf key d with
| Some v -> | Some v ->
(* We use a double newline as a separator. *) (* We use a double newline as a separator. *)
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v)); Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v));
d d
| None -> d | None -> d
in in
@ -281,7 +281,7 @@ let get_dict_entries pdf key =
let es = ref [] in let es = ref [] in
let f d = let f d =
match Pdf.lookup_direct pdf key d with match Pdf.lookup_direct pdf key d with
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v::!es; d | Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d
| None -> d | None -> d
in in
Pdf.objselfmap (dict_entry_single_object f pdf) pdf; Pdf.objselfmap (dict_entry_single_object f pdf) pdf;