yojson writing
This commit is contained in:
parent
f9e58e6d7e
commit
b671b637bd
482
cpdfjson.ml
482
cpdfjson.ml
|
@ -2,13 +2,8 @@ open Pdfutil
|
||||||
open Cpdferror
|
open Cpdferror
|
||||||
|
|
||||||
(*module J = Tjjson
|
(*module J = Tjjson
|
||||||
module P = Pdf
|
|
||||||
module O = Pdfops
|
|
||||||
|
|
||||||
let sof = Printf.sprintf "%f" (* To prevent "0." *)
|
|
||||||
let soi = string_of_int
|
|
||||||
let string_of_float _ = error "use sof"
|
|
||||||
let string_of_int _ = error "use soi"
|
|
||||||
|
|
||||||
let opf = function
|
let opf = function
|
||||||
| J.Object ["F", J.Number f] -> float_of_string f
|
| J.Object ["F", J.Number f] -> float_of_string f
|
||||||
|
@ -129,238 +124,8 @@ and object_of_json = function
|
||||||
Pdfops.stream_of_ops (List.map op_of_json parsed_ops)
|
Pdfops.stream_of_ops (List.map op_of_json parsed_ops)
|
||||||
| J.Object elts -> P.Dictionary (map (fun (n, o) -> (n, object_of_json o)) elts)
|
| J.Object elts -> P.Dictionary (map (fun (n, o) -> (n, object_of_json o)) elts)
|
||||||
|
|
||||||
let rec json_of_object pdf fcs no_stream_data = function
|
|
||||||
| P.Null -> J.Null
|
|
||||||
| P.Boolean b -> J.Bool b
|
|
||||||
| P.Integer i -> J.Object [("I", J.Number (soi i))]
|
|
||||||
| P.Real r -> J.Object [("F", J.Number (sof r))]
|
|
||||||
| P.String s -> J.String s
|
|
||||||
| P.Name n -> J.Object [("N", J.String n)]
|
|
||||||
| P.Array objs -> J.Array (map (json_of_object pdf fcs no_stream_data) objs)
|
|
||||||
| P.Dictionary elts ->
|
|
||||||
iter
|
|
||||||
(function
|
|
||||||
("/Contents", P.Indirect i) ->
|
|
||||||
begin match Pdf.lookup_obj pdf i with
|
|
||||||
| Pdf.Array is -> iter (function Pdf.Indirect i -> fcs i | _ -> ()) is
|
|
||||||
| _ -> fcs i
|
|
||||||
end
|
|
||||||
| ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
|
|
||||||
| _ -> ())
|
|
||||||
elts;
|
|
||||||
J.Object (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data v)) elts)
|
|
||||||
| P.Stream ({contents = (P.Dictionary dict as d, stream)} as mut) as thestream ->
|
|
||||||
P.getstream thestream;
|
|
||||||
let str =
|
|
||||||
begin match P.lookup_direct pdf "/FunctionType" d with
|
|
||||||
| Some _ ->
|
|
||||||
Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
|
|
||||||
begin match !mut with (_, P.Got b) -> Pdfio.string_of_bytes b | _ -> "failure: decomp" end
|
|
||||||
| None ->
|
|
||||||
if no_stream_data then "<<stream data elided>>" else
|
|
||||||
match stream with P.Got b -> Pdfio.string_of_bytes b | P.ToGet _ -> "failure: toget"
|
|
||||||
end
|
|
||||||
in
|
|
||||||
json_of_object pdf fcs no_stream_data (P.Dictionary [("S", P.Array [P.Dictionary dict; P.String str])])
|
|
||||||
| P.Stream _ -> J.String "error: stream with not-a-dictionary"
|
|
||||||
| P.Indirect i ->
|
|
||||||
begin match P.lookup_obj pdf i with
|
|
||||||
| P.Stream {contents = (P.Dictionary dict as d, _)} ->
|
|
||||||
begin match P.lookup_direct pdf "/Subtype" d with
|
|
||||||
| Some (P.Name "/Form") -> fcs i
|
|
||||||
| _ -> ()
|
|
||||||
end
|
|
||||||
| _ -> ()
|
|
||||||
end;
|
|
||||||
J.Number (soi i)
|
|
||||||
|
|
||||||
let json_of_op pdf no_stream_data = function
|
|
||||||
| O.Op_S -> J.Array [J.String "S"]
|
|
||||||
| O.Op_s -> J.Array [J.String "s"]
|
|
||||||
| O.Op_f -> J.Array [J.String "f"]
|
|
||||||
| O.Op_F -> J.Array [J.String "F"]
|
|
||||||
| O.Op_f' ->J.Array [J.String "f*"]
|
|
||||||
| O.Op_B -> J.Array [J.String "B"]
|
|
||||||
| O.Op_B' -> J.Array [J.String "B*"]
|
|
||||||
| O.Op_b -> J.Array [J.String "b"]
|
|
||||||
| O.Op_b' -> J.Array [J.String "b*"]
|
|
||||||
| O.Op_n -> J.Array [J.String "n"]
|
|
||||||
| O.Op_W -> J.Array [J.String "W"]
|
|
||||||
| O.Op_W' -> J.Array [J.String "W*"]
|
|
||||||
| O.Op_BT -> J.Array [J.String "BT"]
|
|
||||||
| O.Op_ET -> J.Array [J.String "ET"]
|
|
||||||
| O.Op_q -> J.Array [J.String "q"]
|
|
||||||
| O.Op_Q -> J.Array [J.String "Q"]
|
|
||||||
| O.Op_h -> J.Array [J.String "h"]
|
|
||||||
| O.Op_T' -> J.Array [J.String "T*"]
|
|
||||||
| O.Op_EMC -> J.Array [J.String "EMC"]
|
|
||||||
| O.Op_BX -> J.Array [J.String "BX"]
|
|
||||||
| O.Op_EX -> J.Array [J.String "EX"]
|
|
||||||
| O.Op_re (a, b, c, d) ->
|
|
||||||
J.Array [J.Number (sof a); J.Number (sof b); J.Number (sof c); J.Number (sof d); J.String "re"]
|
|
||||||
| O.Op_k (c, m, y, k) ->
|
|
||||||
J.Array [J.Number (sof c); J.Number (sof m); J.Number (sof y); J.Number (sof k); J.String "k"]
|
|
||||||
| O.Op_m (a, b) -> J.Array [J.Number (sof a); J.Number (sof b); J.String "m"]
|
|
||||||
| O.Op_l (a, b) -> J.Array [J.Number (sof a); J.Number (sof b); J.String "l"]
|
|
||||||
| O.Op_BDC (s, obj) -> J.Array [J.String s; json_of_object pdf (fun _ -> ()) no_stream_data obj; J.String "BDC"]
|
|
||||||
| O.Op_gs s -> J.Array [J.String s; J.String "gs"]
|
|
||||||
| O.Op_Do s -> J.Array [J.String s; J.String "Do"]
|
|
||||||
| O.Op_CS s -> J.Array [J.String s; J.String "CS"]
|
|
||||||
| O.Op_SCN fs -> J.Array ((map (fun x -> J.Number (sof x)) fs) @ [J.String "SCN"])
|
|
||||||
| O.Op_j j -> J.Array [J.Number (soi j); J.String "j"]
|
|
||||||
| O.Op_cm t ->
|
|
||||||
J.Array
|
|
||||||
[J.Number (sof t.Pdftransform.a); J.Number (sof t.Pdftransform.b); J.Number (sof t.Pdftransform.c);
|
|
||||||
J.Number (sof t.Pdftransform.d); J.Number (sof t.Pdftransform.e); J.Number (sof t.Pdftransform.f);
|
|
||||||
J.String "cm"]
|
|
||||||
| O.Op_d (fl, y) ->
|
|
||||||
J.Array [J.Array (map (fun x -> J.Number (sof x)) fl); J.Number (sof y); J.String "d"]
|
|
||||||
| O.Op_w w -> J.Array [J.Number (sof w); J.String "w"]
|
|
||||||
| O.Op_J j -> J.Array [J.Number (soi j); J.String "J"]
|
|
||||||
| O.Op_M m -> J.Array [J.Number (sof m); J.String "M"]
|
|
||||||
| O.Op_ri s -> J.Array [J.String s; J.String "ri"]
|
|
||||||
| O.Op_i i -> J.Array [J.Number (soi i); J.String "i"]
|
|
||||||
| O.Op_c (a, b, c, d, e, k) ->
|
|
||||||
J.Array
|
|
||||||
[J.Number (sof a); J.Number (sof b); J.Number (sof c);
|
|
||||||
J.Number (sof d); J.Number (sof e); J.Number (sof k); J.String "c"]
|
|
||||||
| O.Op_v (a, b, c, d) ->
|
|
||||||
J.Array
|
|
||||||
[J.Number (sof a); J.Number (sof b); J.Number (sof c);
|
|
||||||
J.Number (sof d); J.String "v"]
|
|
||||||
| O.Op_y (a, b, c, d) ->
|
|
||||||
J.Array
|
|
||||||
[J.Number (sof a); J.Number (sof b); J.Number (sof c);
|
|
||||||
J.Number (sof d); J.String "y"]
|
|
||||||
| O.Op_Tc c -> J.Array [J.Number (sof c); J.String "Tc"]
|
|
||||||
| O.Op_Tw w -> J.Array [J.Number (sof w); J.String "Tw"]
|
|
||||||
| O.Op_Tz z -> J.Array [J.Number (sof z); J.String "Tz"]
|
|
||||||
| O.Op_TL l -> J.Array [J.Number (sof l); J.String "TL"]
|
|
||||||
| O.Op_Tf (k, s) -> J.Array [J.String k; J.Number (sof s); J.String "Tf"]
|
|
||||||
| O.Op_Tr i -> J.Array [J.Number (soi i); J.String "Tr"]
|
|
||||||
| O.Op_Ts k -> J.Array [J.Number (sof k); J.String "Ts"]
|
|
||||||
| O.Op_Td (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "Td"]
|
|
||||||
| O.Op_TD (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "TD"]
|
|
||||||
| O.Op_Tm t ->
|
|
||||||
J.Array
|
|
||||||
[J.Number (sof t.Pdftransform.a); J.Number (sof t.Pdftransform.b); J.Number (sof t.Pdftransform.c);
|
|
||||||
J.Number (sof t.Pdftransform.d); J.Number (sof t.Pdftransform.e); J.Number (sof t.Pdftransform.f);
|
|
||||||
J.String "Tm"]
|
|
||||||
| O.Op_Tj s -> J.Array [J.String s; J.String "Tj"]
|
|
||||||
| O.Op_TJ pdfobject -> J.Array [json_of_object pdf (fun _ -> ()) no_stream_data pdfobject; J.String "TJ"]
|
|
||||||
| O.Op_' s -> J.Array [J.String s; J.String "'"]
|
|
||||||
| O.Op_'' (k, k', s) -> J.Array [J.Number (sof k); J.Number (sof k'); J.String s; J.String "''"]
|
|
||||||
| O.Op_d0 (k, k') -> J.Array [J.Number (sof k); J.Number (sof k'); J.String "d0"]
|
|
||||||
| O.Op_d1 (a, b, c, d, e, k) ->
|
|
||||||
J.Array
|
|
||||||
[J.Number (sof a); J.Number (sof b); J.Number (sof c);
|
|
||||||
J.Number (sof d); J.Number (sof e); J.Number (sof k); J.String "d1"]
|
|
||||||
| O.Op_cs s -> J.Array [J.String s; J.String "cs"]
|
|
||||||
| O.Op_SC fs -> J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String "SC"])
|
|
||||||
| O.Op_sc fs -> J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String "sc"])
|
|
||||||
| O.Op_scn fs -> J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String "scn"])
|
|
||||||
| O.Op_G k -> J.Array [J.Number (sof k); J.String "G"]
|
|
||||||
| O.Op_g k -> J.Array [J.Number (sof k); J.String "g"]
|
|
||||||
| O.Op_RG (r, g, b) -> J.Array [J.Number (sof r); J.Number (sof g); J.Number (sof b); J.String "RG"]
|
|
||||||
| O.Op_rg (r, g, b) -> J.Array [J.Number (sof r); J.Number (sof g); J.Number (sof b); J.String "rg"]
|
|
||||||
| O.Op_K (c, m, y, k) -> J.Array [J.Number (sof c); J.Number (sof m); J.Number (sof y); J.Number (sof k); J.String "K"]
|
|
||||||
| O.Op_sh s -> J.Array [J.String s; J.String "sh"]
|
|
||||||
| O.Op_MP s -> J.Array [J.String s; J.String "MP"]
|
|
||||||
| O.Op_BMC s -> J.Array [J.String s; J.String "BMC"]
|
|
||||||
| O.Op_Unknown s -> J.Array [J.String s; J.String "Unknown"]
|
|
||||||
| O.Op_SCNName (s, fs) ->
|
|
||||||
J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "SCNName"])
|
|
||||||
| O.Op_scnName (s, fs) ->
|
|
||||||
J.Array (map (fun x -> J.Number (sof x)) fs @ [J.String s; J.String "scnName"])
|
|
||||||
| O.InlineImage (dict, data) -> J.Array [json_of_object pdf (fun _ -> ()) no_stream_data dict; J.String (Pdfio.string_of_bytes data); J.String "InlineImage"]
|
|
||||||
| O.Op_DP (s, obj) -> J.Array [J.String s; json_of_object pdf (fun _ -> ()) no_stream_data obj; J.String "DP"]
|
|
||||||
|
|
||||||
(* parse_stream needs pdf and resources. These are for lexing of inline images,
|
|
||||||
* looking up the colourspace. We do not need to worry about inherited
|
|
||||||
* resources, though? For now, don't worry about inherited resources: check in
|
|
||||||
* PDF standard. *)
|
|
||||||
|
|
||||||
let parse_content_stream pdf resources bs =
|
|
||||||
let ops = O.parse_stream pdf resources [bs] in
|
|
||||||
J.Array (map (json_of_op pdf false) ops)
|
|
||||||
|
|
||||||
(* We need to make sure each page only has one page content stream. Otherwise,
|
|
||||||
if not split on op boundaries, each one would fail to parse on its own. *)
|
|
||||||
(* Future improvement. Don't blow up shared content streams. *)
|
|
||||||
let precombine_page_content pdf =
|
|
||||||
let pages' =
|
|
||||||
map
|
|
||||||
(fun page ->
|
|
||||||
match page.Pdfpage.content with
|
|
||||||
[] | [_] -> page
|
|
||||||
| _ ->
|
|
||||||
let operators =
|
|
||||||
Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
|
|
||||||
in
|
|
||||||
{page with Pdfpage.content = [Pdfops.stream_of_ops operators]}
|
|
||||||
)
|
|
||||||
(Pdfpage.pages_of_pagetree pdf)
|
|
||||||
in
|
|
||||||
Pdfpage.change_pages true pdf pages'
|
|
||||||
|
|
||||||
let json_of_pdf parse_content no_stream_data decompress_streams pdf =
|
|
||||||
let pdf = if parse_content then precombine_page_content pdf else pdf in
|
|
||||||
if decompress_streams then
|
|
||||||
Pdf.objiter
|
|
||||||
(fun n obj ->
|
|
||||||
Printf.eprintf "obj %i\n" n;
|
|
||||||
match obj with
|
|
||||||
| Pdf.Stream _ -> Printf.eprintf "decompressing...\n"; Pdfcodec.decode_pdfstream_until_unknown pdf obj
|
|
||||||
| _ -> ())
|
|
||||||
pdf;
|
|
||||||
Pdf.remove_unreferenced pdf;
|
|
||||||
let trailerdict = (0, json_of_object pdf (fun x -> ()) no_stream_data pdf.P.trailerdict) in
|
|
||||||
let parameters =
|
|
||||||
(-1, json_of_object pdf (fun x -> ()) false
|
|
||||||
(Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
|
|
||||||
("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
|
|
||||||
("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
|
|
||||||
("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major);
|
|
||||||
("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor);
|
|
||||||
]))
|
|
||||||
in
|
|
||||||
let content_streams = ref [] in
|
|
||||||
let fcs n =
|
|
||||||
content_streams := n::!content_streams;
|
|
||||||
if parse_content then Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n)
|
|
||||||
in
|
|
||||||
let pairs =
|
|
||||||
let ps = ref [] in
|
|
||||||
P.objiter
|
|
||||||
(fun i pdfobj ->
|
|
||||||
ps := (i, json_of_object pdf fcs no_stream_data pdfobj)::!ps)
|
|
||||||
pdf;
|
|
||||||
parameters::trailerdict::!ps
|
|
||||||
in
|
|
||||||
let pairs_parsed =
|
|
||||||
if not parse_content then pairs else
|
|
||||||
map
|
|
||||||
(fun (objnum, obj) ->
|
|
||||||
if mem objnum !content_streams then
|
|
||||||
begin match obj with
|
|
||||||
| J.Object ["S", J.Array [dict; J.String _]] ->
|
|
||||||
(* FIXME Proper resources here for reasons explained above? *)
|
|
||||||
let streamdata =
|
|
||||||
match P.lookup_obj pdf objnum with
|
|
||||||
| P.Stream {contents = (_, P.Got b)} -> b
|
|
||||||
| _ -> error "JSON: stream not decoded"
|
|
||||||
in
|
|
||||||
(objnum, J.Object ["S", J.Array [dict; parse_content_stream pdf (P.Dictionary []) streamdata]])
|
|
||||||
| _ -> error "json_of_pdf: stream parsing inconsistency"
|
|
||||||
end
|
|
||||||
else
|
|
||||||
(objnum, obj))
|
|
||||||
pairs
|
|
||||||
in
|
|
||||||
J.Array
|
|
||||||
(map
|
|
||||||
(fun (objnum, jsonobj) -> J.Array [J.Number (soi objnum); jsonobj])
|
|
||||||
pairs_parsed)
|
|
||||||
|
|
||||||
let pdf_of_json json =
|
let pdf_of_json json =
|
||||||
(*flprint (J.show json); flprint "\n";*)
|
(*flprint (J.show json); flprint "\n";*)
|
||||||
|
@ -427,7 +192,250 @@ let to_output o parse_content no_stream_data decompress_streams pdf =
|
||||||
o.Pdfio.output_string (Buffer.contents b)
|
o.Pdfio.output_string (Buffer.contents b)
|
||||||
*)
|
*)
|
||||||
|
|
||||||
let to_output _ _ _ _ _ = ()
|
module J = Cpdfyojson
|
||||||
|
module P = Pdf
|
||||||
|
module O = Pdfops
|
||||||
|
|
||||||
|
(* Convert a PDF object to its JSON representation.

   Encoding: null and booleans map directly; integers become {"I": n}, reals
   {"F": x}, names {"N": name}; strings are JSON strings; arrays are JSON
   lists; dictionaries are JSON objects; a stream becomes
   {"S": [dictionary, data]}; an indirect reference becomes a bare integer.

   [fcs] is called with the object number of each indirect object reached
   through a /Contents entry, and of each referenced stream whose /Subtype is
   /Form, so the caller can record (and later parse) content streams.

   If [no_stream_data] is true, stream data is replaced by a placeholder
   string, except for streams with a /FunctionType entry, which are always
   decoded and included.

   Calls [error] on a stream whose first component is not a dictionary. *)
let rec json_of_object pdf fcs no_stream_data = function
  | P.Null -> `Null
  | P.Boolean b -> `Bool b
  | P.Integer i -> `Assoc [("I", `Int i)]
  | P.Real r -> `Assoc [("F", `Float r)]
  | P.String s -> `String s
  | P.Name n -> `Assoc [("N", `String n)]
  | P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data) objs)
  | P.Dictionary elts ->
      (* Report every content stream reachable through /Contents, whether it
         is a single reference, a reference to an array, or a direct array. *)
      iter
        (function
         | ("/Contents", P.Indirect i) ->
             begin match Pdf.lookup_obj pdf i with
             | Pdf.Array is -> iter (function Pdf.Indirect i -> fcs i | _ -> ()) is
             | _ -> fcs i
             end
         | ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
         | _ -> ())
        elts;
      `Assoc (map (fun (k, v) -> (k, json_of_object pdf fcs no_stream_data v)) elts)
  | P.Stream ({contents = (P.Dictionary _ as d, _)} as mut) as thestream ->
      P.getstream thestream;
      (* The values bound by the pattern above were read before getstream (and
         any decoding) ran, so they may be stale. Always re-read the current
         dictionary and data from the reference itself. *)
      let dict, str =
        match P.lookup_direct pdf "/FunctionType" d with
        | Some _ ->
            Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
            begin match !mut with
            | (decoded_dict, P.Got b) -> (decoded_dict, Pdfio.string_of_bytes b)
            | _ -> (d, "failure: decomp")
            end
        | None ->
            if no_stream_data then (d, "<<stream data elided>>") else
              begin match !mut with
              | (_, P.Got b) -> (d, Pdfio.string_of_bytes b)
              | (_, P.ToGet _) -> (d, "failure: toget")
              end
      in
        json_of_object pdf fcs no_stream_data (P.Dictionary [("S", P.Array [dict; P.String str])])
  | P.Stream _ -> error "error: stream with not-a-dictionary"
  | P.Indirect i ->
      (* A reference to a Form XObject also counts as a content stream. *)
      begin match P.lookup_obj pdf i with
      | P.Stream {contents = (P.Dictionary _ as d, _)} ->
          begin match P.lookup_direct pdf "/Subtype" d with
          | Some (P.Name "/Form") -> fcs i
          | _ -> ()
          end
      | _ -> ()
      end;
      `Int i
|
||||||
|
|
||||||
|
(* Convert one content-stream operator to JSON: a list holding the operands in
   order, followed by the operator name as a string. Operands which are PDF
   objects are converted with json_of_object, using a callback that ignores
   any content streams found. *)
let json_of_op pdf no_stream_data op =
  let str s = `String s in
  let real x = `Float x in
  let integer i = `Int i in
  let reals xs = map real xs in
  let pdfobj o = json_of_object pdf (fun _ -> ()) no_stream_data o in
  let matrix t =
    [real t.Pdftransform.a; real t.Pdftransform.b; real t.Pdftransform.c;
     real t.Pdftransform.d; real t.Pdftransform.e; real t.Pdftransform.f]
  in
  (* Each case gives the operand list and the operator name. *)
  let operands, operator =
    match op with
    | O.Op_S -> ([], "S")
    | O.Op_s -> ([], "s")
    | O.Op_f -> ([], "f")
    | O.Op_F -> ([], "F")
    | O.Op_f' -> ([], "f*")
    | O.Op_B -> ([], "B")
    | O.Op_B' -> ([], "B*")
    | O.Op_b -> ([], "b")
    | O.Op_b' -> ([], "b*")
    | O.Op_n -> ([], "n")
    | O.Op_W -> ([], "W")
    | O.Op_W' -> ([], "W*")
    | O.Op_BT -> ([], "BT")
    | O.Op_ET -> ([], "ET")
    | O.Op_q -> ([], "q")
    | O.Op_Q -> ([], "Q")
    | O.Op_h -> ([], "h")
    | O.Op_T' -> ([], "T*")
    | O.Op_EMC -> ([], "EMC")
    | O.Op_BX -> ([], "BX")
    | O.Op_EX -> ([], "EX")
    | O.Op_re (a, b, c, d) -> ([real a; real b; real c; real d], "re")
    | O.Op_k (c, m, y, k) -> ([real c; real m; real y; real k], "k")
    | O.Op_m (a, b) -> ([real a; real b], "m")
    | O.Op_l (a, b) -> ([real a; real b], "l")
    | O.Op_BDC (s, o) -> ([str s; pdfobj o], "BDC")
    | O.Op_gs s -> ([str s], "gs")
    | O.Op_Do s -> ([str s], "Do")
    | O.Op_CS s -> ([str s], "CS")
    | O.Op_SCN fs -> (reals fs, "SCN")
    | O.Op_j j -> ([integer j], "j")
    | O.Op_cm t -> (matrix t, "cm")
    | O.Op_d (fl, y) -> ([`List (reals fl); real y], "d")
    | O.Op_w w -> ([real w], "w")
    | O.Op_J j -> ([integer j], "J")
    | O.Op_M m -> ([real m], "M")
    | O.Op_ri s -> ([str s], "ri")
    | O.Op_i i -> ([integer i], "i")
    | O.Op_c (a, b, c, d, e, k) -> ([real a; real b; real c; real d; real e; real k], "c")
    | O.Op_v (a, b, c, d) -> ([real a; real b; real c; real d], "v")
    | O.Op_y (a, b, c, d) -> ([real a; real b; real c; real d], "y")
    | O.Op_Tc c -> ([real c], "Tc")
    | O.Op_Tw w -> ([real w], "Tw")
    | O.Op_Tz z -> ([real z], "Tz")
    | O.Op_TL l -> ([real l], "TL")
    | O.Op_Tf (k, s) -> ([str k; real s], "Tf")
    | O.Op_Tr i -> ([integer i], "Tr")
    | O.Op_Ts k -> ([real k], "Ts")
    | O.Op_Td (k, k') -> ([real k; real k'], "Td")
    | O.Op_TD (k, k') -> ([real k; real k'], "TD")
    | O.Op_Tm t -> (matrix t, "Tm")
    | O.Op_Tj s -> ([str s], "Tj")
    | O.Op_TJ o -> ([pdfobj o], "TJ")
    | O.Op_' s -> ([str s], "'")
    | O.Op_'' (k, k', s) -> ([real k; real k'; str s], "''")
    | O.Op_d0 (k, k') -> ([real k; real k'], "d0")
    | O.Op_d1 (a, b, c, d, e, k) -> ([real a; real b; real c; real d; real e; real k], "d1")
    | O.Op_cs s -> ([str s], "cs")
    | O.Op_SC fs -> (reals fs, "SC")
    | O.Op_sc fs -> (reals fs, "sc")
    | O.Op_scn fs -> (reals fs, "scn")
    | O.Op_G k -> ([real k], "G")
    | O.Op_g k -> ([real k], "g")
    | O.Op_RG (r, g, b) -> ([real r; real g; real b], "RG")
    | O.Op_rg (r, g, b) -> ([real r; real g; real b], "rg")
    | O.Op_K (c, m, y, k) -> ([real c; real m; real y; real k], "K")
    | O.Op_sh s -> ([str s], "sh")
    | O.Op_MP s -> ([str s], "MP")
    | O.Op_BMC s -> ([str s], "BMC")
    | O.Op_Unknown s -> ([str s], "Unknown")
    | O.Op_SCNName (s, fs) -> (reals fs @ [str s], "SCNName")
    | O.Op_scnName (s, fs) -> (reals fs @ [str s], "scnName")
    | O.InlineImage (dict, data) ->
        ([pdfobj dict; str (Pdfio.string_of_bytes data)], "InlineImage")
    | O.Op_DP (s, o) -> ([str s; pdfobj o], "DP")
  in
    `List (operands @ [str operator])
|
||||||
|
|
||||||
|
|
||||||
|
(* Parse the bytes of one content stream into a JSON list of operators.

   O.parse_stream is given the PDF and a resources dictionary, which it uses
   when lexing inline images (to look up the colourspace). Inherited resources
   are not taken into account for now; whether they need to be should be
   checked against the PDF standard. *)
let parse_content_stream pdf resources bs =
  `List (map (json_of_op pdf false) (O.parse_stream pdf resources [bs]))
|
||||||
|
|
||||||
|
(* Give every page at most one content stream. A page's content may be split
   across several streams at points which are not operator boundaries, in
   which case the individual streams would not parse on their own. Pages with
   zero or one stream are left untouched. *)
(* Future improvement: do not blow up shared content streams. *)
let precombine_page_content pdf =
  let combine page =
    match page.Pdfpage.content with
    | [] | [_] -> page
    | streams ->
        let ops = Pdfops.parse_operators pdf page.Pdfpage.resources streams in
          {page with Pdfpage.content = [Pdfops.stream_of_ops ops]}
  in
    Pdfpage.change_pages true pdf (map combine (Pdfpage.pages_of_pagetree pdf))
|
||||||
|
|
||||||
|
(* Build the JSON representation of a whole PDF.

   The result is a JSON list of [object number, object] pairs. Object -1 is a
   dictionary of parameters describing this export (format version, whether
   content was parsed, whether stream data is included, PDF version), object 0
   is the trailer dictionary, and the remaining pairs are the objects of the
   file.

   [parse_content]: combine each page's content into a single stream, then
   replace the data of every content stream with a JSON list of operators.
   [no_stream_data]: replace stream data with a placeholder string.
   [decompress_streams]: decode every stream as far as possible first.

   The PDF is modified along the way (streams are decoded and unreferenced
   objects removed). Calls [error] if a content stream cannot be parsed. *)
let json_of_pdf parse_content no_stream_data decompress_streams pdf =
  let pdf = if parse_content then precombine_page_content pdf else pdf in
  if decompress_streams then
    Pdf.objiter
      (fun _ obj ->
         match obj with
         | Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj
         | _ -> ())
      pdf;
  Pdf.remove_unreferenced pdf;
  let trailerdict = (0, json_of_object pdf (fun _ -> ()) no_stream_data pdf.P.trailerdict) in
  let parameters =
    (-1, json_of_object pdf (fun _ -> ()) false
      (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
                       ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
                       ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
                       ("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major);
                       ("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor);
                      ]))
  in
  (* Object numbers of content streams, filled in by [fcs] while the objects
     are converted. A hash table keeps the membership test below cheap and
     makes repeated reports of the same stream harmless. *)
  let content_streams = Hashtbl.create 16 in
  let fcs n =
    Hashtbl.replace content_streams n ();
    if parse_content then Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n)
  in
  let pairs =
    let ps = ref [] in
      P.objiter
        (fun i pdfobj ->
           ps := (i, json_of_object pdf fcs no_stream_data pdfobj)::!ps)
        pdf;
      parameters::trailerdict::!ps
  in
  let pairs_parsed =
    if not parse_content then pairs else
      map
        (fun (objnum, obj) ->
           if Hashtbl.mem content_streams objnum then
             begin match obj with
             | `Assoc ["S", `List [dict; `String _]] ->
                 (* FIXME Proper resources here for reasons explained above? *)
                 (* Take the data from the PDF, not from the JSON string: the
                    stream may have been decoded by [fcs] after this object
                    was converted. *)
                 let streamdata =
                   match P.lookup_obj pdf objnum with
                   | P.Stream {contents = (_, P.Got b)} -> b
                   | _ -> error "JSON: stream not decoded"
                 in
                   (objnum, `Assoc ["S", `List [dict; parse_content_stream pdf (P.Dictionary []) streamdata]])
             | _ -> error "json_of_pdf: stream parsing inconsistency"
             end
           else
             (objnum, obj))
        pairs
  in
    `List
      (map
        (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
        pairs_parsed)
|
||||||
|
|
||||||
|
(* Write the JSON representation of a PDF to the output [o], pretty-printed.
   FIXME Proper streaming to output: at present the whole JSON text is built
   as one string before anything is written. *)
let to_output o parse_content no_stream_data decompress_streams pdf =
  let text =
    J.pretty_to_string
      (json_of_pdf parse_content no_stream_data decompress_streams pdf)
  in
    o.Pdfio.output_string text
|
||||||
|
|
||||||
let example_pdf =
|
let example_pdf =
|
||||||
let page =
|
let page =
|
||||||
|
|
Loading…
Reference in New Issue