This commit is contained in:
John Whitington 2023-01-16 14:29:54 +08:00
parent 7a7f3eba29
commit 7a8a1267e4
5 changed files with 29 additions and 24 deletions

View File

@ -74,7 +74,7 @@ let annotations_json_page pdf page pagenum =
let annot = Pdf.direct pdf annot in
let annot = rewrite_destinations pdf annot in
extra := annot::!extra;
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot])
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot])
annots
| _ -> []
@ -127,7 +127,7 @@ let postprocess_json pdf objnum_to_serial_map json =
| `List [`Int pagenum; `Int serial; jo] ->
let pdfobj = Cpdfjson.object_of_json jo in
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false fixed]
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed]
| _ -> assert false)
json
@ -149,13 +149,13 @@ let list_annotations_json range pdf =
let extra =
map
(fun n ->
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false (Pdf.lookup_obj pdf n)])
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
(setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra)))
in
let header =
`List
[`Int 0;
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
in
let json = `List ([header] @ json @ extra) in

View File

@ -176,8 +176,8 @@ let json_of_target pdf fastrefnums x =
let a =
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
in
Cpdfjson.json_of_object pdf (fun _ -> ()) false false a
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) false false x
Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false a
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false x
let output_json_marks output calculate_page_number pdf fastrefnums marks =
let module J = Cpdfyojson.Safe in

View File

@ -1,17 +1,22 @@
(* Read and write PDF files in JSON format.
Format version 3: adds UTF8 option
The file is an array of arrays containing an object number followed by an
object, one for each object in the file and two special ones:
Object -1: CPDF's own data with the PDF version number, CPDF JSON format
number, and flags used when writing (which may be required when reading):
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2)
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 3)
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
round-trip if false).
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
o /CPDFJSONminorpdfversion (CPDFJSON integer)
o /CPDFJSONisUTF8 (Optional. Format 3. If true, strings are converted to UTF8
before conversion to JSON, and converted back to PDFDocEncoding/UTF16BE during
converstion to PDF.
Object 0: The PDF's trailer dictionary
@ -273,14 +278,14 @@ let mkfloat f = `Assoc [("F", `Float f)]
let mkint i = `Assoc [("I", `Int i)]
let mkname n = `Assoc [("N", `String n)]
let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function
let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
| P.Null -> `Null
| P.Boolean b -> `Bool b
| P.Integer i -> mkint i
| P.Real r -> mkfloat r
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
| P.Name n -> mkname n
| P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs)
| P.Array objs -> `List (map (json_of_object pdf fcs ~no_stream_data ~parse_content) objs)
| P.Dictionary elts ->
iter
(function
@ -292,11 +297,11 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
| ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
| _ -> ())
elts;
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data pcs v)) elts)
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs ~no_stream_data ~parse_content v)) elts)
| P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream ->
P.getstream thestream;
let str, dict' =
match P.lookup_direct pdf "/FunctionType" d, pcs with
match P.lookup_direct pdf "/FunctionType" d, parse_content with
| Some _, true ->
Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
let dict = P.remove_dict_entry d "/Filter" in
@ -305,7 +310,7 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
if no_stream_data then ("<<stream data elided>>", d) else
match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget"
in
json_of_object pdf fcs no_stream_data pcs (P.Dictionary [("S", P.Array [dict'; P.String str])])
json_of_object pdf fcs ~no_stream_data ~parse_content (P.Dictionary [("S", P.Array [dict'; P.String str])])
| P.Stream _ -> error "error: stream with not-a-dictionary"
| P.Indirect i ->
begin match P.lookup_obj pdf i with
@ -350,7 +355,7 @@ let json_of_op pdf no_stream_data = function
`List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"]
| O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"]
| O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"]
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "BDC"]
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "BDC"]
| O.Op_gs s -> `List [`String s; `String "gs"]
| O.Op_Do s -> `List [`String s; `String "Do"]
| O.Op_CS s -> `List [`String s; `String "CS"]
@ -395,7 +400,7 @@ let json_of_op pdf no_stream_data = function
mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f;
`String "Tm"]
| O.Op_Tj s -> `List [`String s; `String "Tj"]
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) no_stream_data false pdfobject; `String "TJ"]
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false pdfobject; `String "TJ"]
| O.Op_' s -> `List [`String s; `String "'"]
| O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"]
| O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"]
@ -421,9 +426,9 @@ let json_of_op pdf no_stream_data = function
| O.Op_scnName (s, fs) ->
`List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"])
| O.InlineImage (dict, data) ->
`List [json_of_object pdf (fun _ -> ()) no_stream_data false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
`List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
| O.Op_DP (s, obj) ->
`List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "DP"]
`List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "DP"]
(* parse_stream needs pdf and resources. These are for lexing of inline images,
* looking up the colourspace. *)
@ -475,9 +480,9 @@ let json_of_pdf
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
pdf;
Pdf.remove_unreferenced pdf;
let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data false pdf.P.trailerdict) in
let trailerdict = (0, json_of_object pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
let parameters =
(-1, json_of_object pdf (fun x -> ()) false false
(-1, json_of_object pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
@ -494,7 +499,7 @@ let json_of_pdf
let ps = ref [] in
P.objiter
(fun i pdfobj ->
ps := (i, json_of_object pdf fcs no_stream_data parse_content pdfobj)::!ps)
ps := (i, json_of_object pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps)
pdf;
parameters::trailerdict::sort compare !ps
in
@ -529,7 +534,7 @@ let json_of_pdf
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
pairs_parsed)
let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
let to_output o ?(utf8=false) ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
match o.Pdfio.out_caml_channel with
| Some ch -> J.pretty_to_channel ch json

View File

@ -4,7 +4,7 @@
streams, [no_stream_data] will omit stream data, [decompress_streams]
decompresses all streams, [clean_strings] will convert any UTF16BE strings
to PDFDocEncoding if it can. *)
val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
val to_output : Pdfio.output -> ?utf8:bool -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
(** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *)
val of_input : Pdfio.input -> Pdf.t
@ -13,7 +13,7 @@ val of_input : Pdfio.input -> Pdf.t
Then the PDF file, then a function which is usually [function _ -> ()], then
[no_stream_data] as above, then [parse_content_streams] as above, and
finally the object itself. *)
val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
val json_of_object : ?utf8:bool -> ?clean_strings:bool -> Pdf.t -> (int -> unit) -> no_stream_data:bool -> parse_content:bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
(** Convert a single CPDFJSON object to a PDF object *)
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject

View File

@ -269,7 +269,7 @@ let print_dict_entry pdf key =
match Pdf.lookup_direct pdf key d with
| Some v ->
(* We use a double newline as a separator. *)
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v));
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v));
d
| None -> d
in
@ -281,7 +281,7 @@ let get_dict_entries pdf key =
let es = ref [] in
let f d =
match Pdf.lookup_direct pdf key d with
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v::!es; d
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d
| None -> d
in
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;