more
This commit is contained in:
parent
7a7f3eba29
commit
7a8a1267e4
|
@ -74,7 +74,7 @@ let annotations_json_page pdf page pagenum =
|
|||
let annot = Pdf.direct pdf annot in
|
||||
let annot = rewrite_destinations pdf annot in
|
||||
extra := annot::!extra;
|
||||
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot])
|
||||
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot])
|
||||
annots
|
||||
| _ -> []
|
||||
|
||||
|
@ -127,7 +127,7 @@ let postprocess_json pdf objnum_to_serial_map json =
|
|||
| `List [`Int pagenum; `Int serial; jo] ->
|
||||
let pdfobj = Cpdfjson.object_of_json jo in
|
||||
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
|
||||
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false fixed]
|
||||
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed]
|
||||
| _ -> assert false)
|
||||
json
|
||||
|
||||
|
@ -149,13 +149,13 @@ let list_annotations_json range pdf =
|
|||
let extra =
|
||||
map
|
||||
(fun n ->
|
||||
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false (Pdf.lookup_obj pdf n)])
|
||||
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
|
||||
(setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra)))
|
||||
in
|
||||
let header =
|
||||
`List
|
||||
[`Int 0;
|
||||
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false
|
||||
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false
|
||||
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
|
||||
in
|
||||
let json = `List ([header] @ json @ extra) in
|
||||
|
|
|
@ -176,8 +176,8 @@ let json_of_target pdf fastrefnums x =
|
|||
let a =
|
||||
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
|
||||
in
|
||||
Cpdfjson.json_of_object pdf (fun _ -> ()) false false a
|
||||
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) false false x
|
||||
Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false a
|
||||
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false x
|
||||
|
||||
let output_json_marks output calculate_page_number pdf fastrefnums marks =
|
||||
let module J = Cpdfyojson.Safe in
|
||||
|
|
33
cpdfjson.ml
33
cpdfjson.ml
|
@ -1,17 +1,22 @@
|
|||
(* Read and write PDF files in JSON format.
|
||||
|
||||
Format version 3: adds UTF8 option
|
||||
|
||||
The file is an array of arrays containing an object number followed by an
|
||||
object, one for each object in the file and two special ones:
|
||||
|
||||
Object -1: CPDF's own data with the PDF version number, CPDF JSON format
|
||||
number, and flags used when writing (which may be required when reading):
|
||||
|
||||
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2)
|
||||
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 3)
|
||||
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
|
||||
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
|
||||
round-trip if false).
|
||||
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
|
||||
o /CPDFJSONminorpdfversion (CPDFJSON integer)
|
||||
o /CPDFJSONisUTF8 (Optional. Format 3. If true, strings are converted to UTF8
|
||||
before conversion to JSON, and converted back to PDFDocEncoding/UTF16BE during
|
||||
conversion to PDF).
|
||||
|
||||
Object 0: The PDF's trailer dictionary
|
||||
|
||||
|
@ -273,14 +278,14 @@ let mkfloat f = `Assoc [("F", `Float f)]
|
|||
(* Wrap an integer as a CPDFJSON integer: an object with the single key "I". *)
let mkint i =
  let field = ("I", `Int i) in
  `Assoc [field]
|
||||
(* Wrap a name string as a CPDFJSON name: an object with the single key "N". *)
let mkname n =
  let field = ("N", `String n) in
  `Assoc [field]
|
||||
|
||||
let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function
|
||||
let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
|
||||
| P.Null -> `Null
|
||||
| P.Boolean b -> `Bool b
|
||||
| P.Integer i -> mkint i
|
||||
| P.Real r -> mkfloat r
|
||||
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
|
||||
| P.Name n -> mkname n
|
||||
| P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs)
|
||||
| P.Array objs -> `List (map (json_of_object pdf fcs ~no_stream_data ~parse_content) objs)
|
||||
| P.Dictionary elts ->
|
||||
iter
|
||||
(function
|
||||
|
@ -292,11 +297,11 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
|
|||
| ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
|
||||
| _ -> ())
|
||||
elts;
|
||||
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data pcs v)) elts)
|
||||
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs ~no_stream_data ~parse_content v)) elts)
|
||||
| P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream ->
|
||||
P.getstream thestream;
|
||||
let str, dict' =
|
||||
match P.lookup_direct pdf "/FunctionType" d, pcs with
|
||||
match P.lookup_direct pdf "/FunctionType" d, parse_content with
|
||||
| Some _, true ->
|
||||
Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
|
||||
let dict = P.remove_dict_entry d "/Filter" in
|
||||
|
@ -305,7 +310,7 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
|
|||
if no_stream_data then ("<<stream data elided>>", d) else
|
||||
match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget"
|
||||
in
|
||||
json_of_object pdf fcs no_stream_data pcs (P.Dictionary [("S", P.Array [dict'; P.String str])])
|
||||
json_of_object pdf fcs ~no_stream_data ~parse_content (P.Dictionary [("S", P.Array [dict'; P.String str])])
|
||||
| P.Stream _ -> error "error: stream with not-a-dictionary"
|
||||
| P.Indirect i ->
|
||||
begin match P.lookup_obj pdf i with
|
||||
|
@ -350,7 +355,7 @@ let json_of_op pdf no_stream_data = function
|
|||
`List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"]
|
||||
| O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"]
|
||||
| O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"]
|
||||
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "BDC"]
|
||||
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "BDC"]
|
||||
| O.Op_gs s -> `List [`String s; `String "gs"]
|
||||
| O.Op_Do s -> `List [`String s; `String "Do"]
|
||||
| O.Op_CS s -> `List [`String s; `String "CS"]
|
||||
|
@ -395,7 +400,7 @@ let json_of_op pdf no_stream_data = function
|
|||
mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f;
|
||||
`String "Tm"]
|
||||
| O.Op_Tj s -> `List [`String s; `String "Tj"]
|
||||
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) no_stream_data false pdfobject; `String "TJ"]
|
||||
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false pdfobject; `String "TJ"]
|
||||
| O.Op_' s -> `List [`String s; `String "'"]
|
||||
| O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"]
|
||||
| O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"]
|
||||
|
@ -421,9 +426,9 @@ let json_of_op pdf no_stream_data = function
|
|||
| O.Op_scnName (s, fs) ->
|
||||
`List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"])
|
||||
| O.InlineImage (dict, data) ->
|
||||
`List [json_of_object pdf (fun _ -> ()) no_stream_data false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
|
||||
`List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
|
||||
| O.Op_DP (s, obj) ->
|
||||
`List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "DP"]
|
||||
`List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "DP"]
|
||||
|
||||
(* parse_stream needs pdf and resources. These are for lexing of inline images,
|
||||
* looking up the colourspace. *)
|
||||
|
@ -475,9 +480,9 @@ let json_of_pdf
|
|||
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
|
||||
pdf;
|
||||
Pdf.remove_unreferenced pdf;
|
||||
let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data false pdf.P.trailerdict) in
|
||||
let trailerdict = (0, json_of_object pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
|
||||
let parameters =
|
||||
(-1, json_of_object pdf (fun x -> ()) false false
|
||||
(-1, json_of_object pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false
|
||||
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
|
||||
("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
|
||||
("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
|
||||
|
@ -494,7 +499,7 @@ let json_of_pdf
|
|||
let ps = ref [] in
|
||||
P.objiter
|
||||
(fun i pdfobj ->
|
||||
ps := (i, json_of_object pdf fcs no_stream_data parse_content pdfobj)::!ps)
|
||||
ps := (i, json_of_object pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps)
|
||||
pdf;
|
||||
parameters::trailerdict::sort compare !ps
|
||||
in
|
||||
|
@ -529,7 +534,7 @@ let json_of_pdf
|
|||
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
|
||||
pairs_parsed)
|
||||
|
||||
let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
|
||||
let to_output o ?(utf8=false) ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
|
||||
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
|
||||
match o.Pdfio.out_caml_channel with
|
||||
| Some ch -> J.pretty_to_channel ch json
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
streams, [no_stream_data] will omit stream data, [decompress_streams]
|
||||
decompresses all streams, [clean_strings] will convert any UTF16BE strings
|
||||
to PDFDocEncoding if it can. *)
|
||||
val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
|
||||
val to_output : Pdfio.output -> ?utf8:bool -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
|
||||
|
||||
(** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *)
|
||||
val of_input : Pdfio.input -> Pdf.t
|
||||
|
@ -13,7 +13,7 @@ val of_input : Pdfio.input -> Pdf.t
|
|||
Then the PDF file, then a function which is usually [function _ -> ()], then
|
||||
[no_stream_data] as above, then [parse_content] as above, and
|
||||
finally the object itself. *)
|
||||
val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
|
||||
val json_of_object : ?utf8:bool -> ?clean_strings:bool -> Pdf.t -> (int -> unit) -> no_stream_data:bool -> parse_content:bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
|
||||
|
||||
(** Convert a single CPDFJSON object to a PDF object *)
|
||||
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject
|
||||
|
|
|
@ -269,7 +269,7 @@ let print_dict_entry pdf key =
|
|||
match Pdf.lookup_direct pdf key d with
|
||||
| Some v ->
|
||||
(* We use a double newline as a separator. *)
|
||||
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v));
|
||||
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v));
|
||||
d
|
||||
| None -> d
|
||||
in
|
||||
|
@ -281,7 +281,7 @@ let get_dict_entries pdf key =
|
|||
let es = ref [] in
|
||||
let f d =
|
||||
match Pdf.lookup_direct pdf key d with
|
||||
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v::!es; d
|
||||
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d
|
||||
| None -> d
|
||||
in
|
||||
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
|
||||
|
|
Loading…
Reference in New Issue