mirror of
https://github.com/johnwhitington/cpdf-source.git
synced 2025-02-17 04:10:48 +01:00
more
This commit is contained in:
parent
7a7f3eba29
commit
7a8a1267e4
@ -74,7 +74,7 @@ let annotations_json_page pdf page pagenum =
|
|||||||
let annot = Pdf.direct pdf annot in
|
let annot = Pdf.direct pdf annot in
|
||||||
let annot = rewrite_destinations pdf annot in
|
let annot = rewrite_destinations pdf annot in
|
||||||
extra := annot::!extra;
|
extra := annot::!extra;
|
||||||
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot])
|
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot])
|
||||||
annots
|
annots
|
||||||
| _ -> []
|
| _ -> []
|
||||||
|
|
||||||
@ -127,7 +127,7 @@ let postprocess_json pdf objnum_to_serial_map json =
|
|||||||
| `List [`Int pagenum; `Int serial; jo] ->
|
| `List [`Int pagenum; `Int serial; jo] ->
|
||||||
let pdfobj = Cpdfjson.object_of_json jo in
|
let pdfobj = Cpdfjson.object_of_json jo in
|
||||||
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
|
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
|
||||||
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false fixed]
|
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed]
|
||||||
| _ -> assert false)
|
| _ -> assert false)
|
||||||
json
|
json
|
||||||
|
|
||||||
@ -149,13 +149,13 @@ let list_annotations_json range pdf =
|
|||||||
let extra =
|
let extra =
|
||||||
map
|
map
|
||||||
(fun n ->
|
(fun n ->
|
||||||
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false (Pdf.lookup_obj pdf n)])
|
`List [`Int ~-n; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
|
||||||
(setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra)))
|
(setify (flatten (map (Pdf.objects_referenced [] [] pdf) extra)))
|
||||||
in
|
in
|
||||||
let header =
|
let header =
|
||||||
`List
|
`List
|
||||||
[`Int 0;
|
[`Int 0;
|
||||||
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false
|
Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false
|
||||||
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
|
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
|
||||||
in
|
in
|
||||||
let json = `List ([header] @ json @ extra) in
|
let json = `List ([header] @ json @ extra) in
|
||||||
|
@ -176,8 +176,8 @@ let json_of_target pdf fastrefnums x =
|
|||||||
let a =
|
let a =
|
||||||
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
|
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
|
||||||
in
|
in
|
||||||
Cpdfjson.json_of_object pdf (fun _ -> ()) false false a
|
Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false a
|
||||||
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) false false x
|
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false x
|
||||||
|
|
||||||
let output_json_marks output calculate_page_number pdf fastrefnums marks =
|
let output_json_marks output calculate_page_number pdf fastrefnums marks =
|
||||||
let module J = Cpdfyojson.Safe in
|
let module J = Cpdfyojson.Safe in
|
||||||
|
33
cpdfjson.ml
33
cpdfjson.ml
@ -1,17 +1,22 @@
|
|||||||
(* Read and write PDF files in JSON format.
|
(* Read and write PDF files in JSON format.
|
||||||
|
|
||||||
|
Format version 3: adds UTF8 option
|
||||||
|
|
||||||
The file is an array of arrays containing an object number followed by an
|
The file is an array of arrays containing an object number followed by an
|
||||||
object, one for each object in the file and two special ones:
|
object, one for each object in the file and two special ones:
|
||||||
|
|
||||||
Object -1: CPDF's own data with the PDF version number, CPDF JSON format
|
Object -1: CPDF's own data with the PDF version number, CPDF JSON format
|
||||||
number, and flags used when writing (which may be required when reading):
|
number, and flags used when writing (which may be required when reading):
|
||||||
|
|
||||||
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2)
|
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 3)
|
||||||
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
|
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
|
||||||
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
|
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
|
||||||
round-trip if false).
|
round-trip if false).
|
||||||
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
|
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
|
||||||
o /CPDFJSONminorpdfversion (CPDFJSON integer)
|
o /CPDFJSONminorpdfversion (CPDFJSON integer)
|
||||||
|
o /CPDFJSONisUTF8 (Optional. Format 3. If true, strings are converted to UTF8
|
||||||
|
before conversion to JSON, and converted back to PDFDocEncoding/UTF16BE during
|
||||||
|
conversion to PDF).
|
||||||
|
|
||||||
Object 0: The PDF's trailer dictionary
|
Object 0: The PDF's trailer dictionary
|
||||||
|
|
||||||
@ -273,14 +278,14 @@ let mkfloat f = `Assoc [("F", `Float f)]
|
|||||||
let mkint i = `Assoc [("I", `Int i)]
|
let mkint i = `Assoc [("I", `Int i)]
|
||||||
let mkname n = `Assoc [("N", `String n)]
|
let mkname n = `Assoc [("N", `String n)]
|
||||||
|
|
||||||
let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function
|
let rec json_of_object ?(utf8=false) ?(clean_strings=false) pdf fcs ~no_stream_data ~parse_content = function
|
||||||
| P.Null -> `Null
|
| P.Null -> `Null
|
||||||
| P.Boolean b -> `Bool b
|
| P.Boolean b -> `Bool b
|
||||||
| P.Integer i -> mkint i
|
| P.Integer i -> mkint i
|
||||||
| P.Real r -> mkfloat r
|
| P.Real r -> mkfloat r
|
||||||
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
|
| P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
|
||||||
| P.Name n -> mkname n
|
| P.Name n -> mkname n
|
||||||
| P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs)
|
| P.Array objs -> `List (map (json_of_object pdf fcs ~no_stream_data ~parse_content) objs)
|
||||||
| P.Dictionary elts ->
|
| P.Dictionary elts ->
|
||||||
iter
|
iter
|
||||||
(function
|
(function
|
||||||
@ -292,11 +297,11 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
|
|||||||
| ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
|
| ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
|
||||||
| _ -> ())
|
| _ -> ())
|
||||||
elts;
|
elts;
|
||||||
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data pcs v)) elts)
|
`Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs ~no_stream_data ~parse_content v)) elts)
|
||||||
| P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream ->
|
| P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream ->
|
||||||
P.getstream thestream;
|
P.getstream thestream;
|
||||||
let str, dict' =
|
let str, dict' =
|
||||||
match P.lookup_direct pdf "/FunctionType" d, pcs with
|
match P.lookup_direct pdf "/FunctionType" d, parse_content with
|
||||||
| Some _, true ->
|
| Some _, true ->
|
||||||
Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
|
Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
|
||||||
let dict = P.remove_dict_entry d "/Filter" in
|
let dict = P.remove_dict_entry d "/Filter" in
|
||||||
@ -305,7 +310,7 @@ let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = funct
|
|||||||
if no_stream_data then ("<<stream data elided>>", d) else
|
if no_stream_data then ("<<stream data elided>>", d) else
|
||||||
match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget"
|
match !mut with (_, P.Got b) -> (Pdfio.string_of_bytes b, d) | _ -> error "failure: toget"
|
||||||
in
|
in
|
||||||
json_of_object pdf fcs no_stream_data pcs (P.Dictionary [("S", P.Array [dict'; P.String str])])
|
json_of_object pdf fcs ~no_stream_data ~parse_content (P.Dictionary [("S", P.Array [dict'; P.String str])])
|
||||||
| P.Stream _ -> error "error: stream with not-a-dictionary"
|
| P.Stream _ -> error "error: stream with not-a-dictionary"
|
||||||
| P.Indirect i ->
|
| P.Indirect i ->
|
||||||
begin match P.lookup_obj pdf i with
|
begin match P.lookup_obj pdf i with
|
||||||
@ -350,7 +355,7 @@ let json_of_op pdf no_stream_data = function
|
|||||||
`List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"]
|
`List [mkfloat c; mkfloat m; mkfloat y; mkfloat k; `String "k"]
|
||||||
| O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"]
|
| O.Op_m (a, b) -> `List [mkfloat a; mkfloat b; `String "m"]
|
||||||
| O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"]
|
| O.Op_l (a, b) -> `List [mkfloat a; mkfloat b; `String "l"]
|
||||||
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "BDC"]
|
| O.Op_BDC (s, obj) -> `List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "BDC"]
|
||||||
| O.Op_gs s -> `List [`String s; `String "gs"]
|
| O.Op_gs s -> `List [`String s; `String "gs"]
|
||||||
| O.Op_Do s -> `List [`String s; `String "Do"]
|
| O.Op_Do s -> `List [`String s; `String "Do"]
|
||||||
| O.Op_CS s -> `List [`String s; `String "CS"]
|
| O.Op_CS s -> `List [`String s; `String "CS"]
|
||||||
@ -395,7 +400,7 @@ let json_of_op pdf no_stream_data = function
|
|||||||
mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f;
|
mkfloat t.Pdftransform.d; mkfloat t.Pdftransform.e; mkfloat t.Pdftransform.f;
|
||||||
`String "Tm"]
|
`String "Tm"]
|
||||||
| O.Op_Tj s -> `List [`String s; `String "Tj"]
|
| O.Op_Tj s -> `List [`String s; `String "Tj"]
|
||||||
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) no_stream_data false pdfobject; `String "TJ"]
|
| O.Op_TJ pdfobject -> `List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false pdfobject; `String "TJ"]
|
||||||
| O.Op_' s -> `List [`String s; `String "'"]
|
| O.Op_' s -> `List [`String s; `String "'"]
|
||||||
| O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"]
|
| O.Op_'' (k, k', s) -> `List [mkfloat k; mkfloat k'; `String s; `String "''"]
|
||||||
| O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"]
|
| O.Op_d0 (k, k') -> `List [mkfloat k; mkfloat k'; `String "d0"]
|
||||||
@ -421,9 +426,9 @@ let json_of_op pdf no_stream_data = function
|
|||||||
| O.Op_scnName (s, fs) ->
|
| O.Op_scnName (s, fs) ->
|
||||||
`List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"])
|
`List (map (fun x -> mkfloat x) fs @ [`String s; `String "scnName"])
|
||||||
| O.InlineImage (dict, data) ->
|
| O.InlineImage (dict, data) ->
|
||||||
`List [json_of_object pdf (fun _ -> ()) no_stream_data false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
|
`List [json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false dict; `String (Pdfio.string_of_bytes data); `String "InlineImage"]
|
||||||
| O.Op_DP (s, obj) ->
|
| O.Op_DP (s, obj) ->
|
||||||
`List [`String s; json_of_object pdf (fun _ -> ()) no_stream_data false obj; `String "DP"]
|
`List [`String s; json_of_object pdf (fun _ -> ()) ~no_stream_data ~parse_content:false obj; `String "DP"]
|
||||||
|
|
||||||
(* parse_stream needs pdf and resources. These are for lexing of inline images,
|
(* parse_stream needs pdf and resources. These are for lexing of inline images,
|
||||||
* looking up the colourspace. *)
|
* looking up the colourspace. *)
|
||||||
@ -475,9 +480,9 @@ let json_of_pdf
|
|||||||
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
|
(fun _ obj -> match obj with Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj | _ -> ())
|
||||||
pdf;
|
pdf;
|
||||||
Pdf.remove_unreferenced pdf;
|
Pdf.remove_unreferenced pdf;
|
||||||
let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data false pdf.P.trailerdict) in
|
let trailerdict = (0, json_of_object pdf (fun x -> ()) ~no_stream_data ~parse_content:false pdf.P.trailerdict) in
|
||||||
let parameters =
|
let parameters =
|
||||||
(-1, json_of_object pdf (fun x -> ()) false false
|
(-1, json_of_object pdf (fun x -> ()) ~no_stream_data:false ~parse_content:false
|
||||||
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
|
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
|
||||||
("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
|
("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
|
||||||
("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
|
("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
|
||||||
@ -494,7 +499,7 @@ let json_of_pdf
|
|||||||
let ps = ref [] in
|
let ps = ref [] in
|
||||||
P.objiter
|
P.objiter
|
||||||
(fun i pdfobj ->
|
(fun i pdfobj ->
|
||||||
ps := (i, json_of_object pdf fcs no_stream_data parse_content pdfobj)::!ps)
|
ps := (i, json_of_object pdf fcs ~no_stream_data ~parse_content pdfobj)::!ps)
|
||||||
pdf;
|
pdf;
|
||||||
parameters::trailerdict::sort compare !ps
|
parameters::trailerdict::sort compare !ps
|
||||||
in
|
in
|
||||||
@ -529,7 +534,7 @@ let json_of_pdf
|
|||||||
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
|
(fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
|
||||||
pairs_parsed)
|
pairs_parsed)
|
||||||
|
|
||||||
let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
|
let to_output o ?(utf8=false) ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
|
||||||
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
|
let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in
|
||||||
match o.Pdfio.out_caml_channel with
|
match o.Pdfio.out_caml_channel with
|
||||||
| Some ch -> J.pretty_to_channel ch json
|
| Some ch -> J.pretty_to_channel ch json
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
streams, [no_stream_data] will omit stream data, [decompress_streams]
|
streams, [no_stream_data] will omit stream data, [decompress_streams]
|
||||||
decompresses all streams, [clean_strings] will convert any UTF16BE strings
|
decompresses all streams, [clean_strings] will convert any UTF16BE strings
|
||||||
to PDFDocEncoding if it can. *)
|
to PDFDocEncoding if it can. *)
|
||||||
val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
|
val to_output : Pdfio.output -> ?utf8:bool -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit
|
||||||
|
|
||||||
(** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *)
|
(** Read a CPDFJSON PDF from an input. /Length entries will be corrected automatically. *)
|
||||||
val of_input : Pdfio.input -> Pdf.t
|
val of_input : Pdfio.input -> Pdf.t
|
||||||
@ -13,7 +13,7 @@ val of_input : Pdfio.input -> Pdf.t
|
|||||||
Then the PDF file, then a function which is usually [function _ -> ()], then
|
Then the PDF file, then a function which is usually [function _ -> ()], then
|
||||||
[no_stream_data] as above, then [parse_content_streams] as above, and
|
[no_stream_data] as above, then [parse_content] as above, and
|
||||||
finally the object itself. *)
|
finally the object itself. *)
|
||||||
val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
|
val json_of_object : ?utf8:bool -> ?clean_strings:bool -> Pdf.t -> (int -> unit) -> no_stream_data:bool -> parse_content:bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t
|
||||||
|
|
||||||
(** Convert a single CPDFJSON object to a PDF object *)
|
(** Convert a single CPDFJSON object to a PDF object *)
|
||||||
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject
|
val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject
|
||||||
|
@ -269,7 +269,7 @@ let print_dict_entry pdf key =
|
|||||||
match Pdf.lookup_direct pdf key d with
|
match Pdf.lookup_direct pdf key d with
|
||||||
| Some v ->
|
| Some v ->
|
||||||
(* We use a double newline as a separator. *)
|
(* We use a double newline as a separator. *)
|
||||||
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v));
|
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v));
|
||||||
d
|
d
|
||||||
| None -> d
|
| None -> d
|
||||||
in
|
in
|
||||||
@ -281,7 +281,7 @@ let get_dict_entries pdf key =
|
|||||||
let es = ref [] in
|
let es = ref [] in
|
||||||
let f d =
|
let f d =
|
||||||
match Pdf.lookup_direct pdf key d with
|
match Pdf.lookup_direct pdf key d with
|
||||||
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v::!es; d
|
| Some v -> es := Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d
|
||||||
| None -> d
|
| None -> d
|
||||||
in
|
in
|
||||||
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
|
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user