cpdf-source/cpdfjson.ml

548 lines
22 KiB
OCaml
Raw Normal View History

2021-10-15 15:50:08 +02:00
(* Read and write PDF files in JSON format.
The file is an array of arrays containing an object number followed by an
object, one for each object in the file and two special ones:
Object -1: CPDF's own data with the PDF version number, CPDF JSON format
number, and flags used when writing (which may be required when reading):
2021-10-29 19:17:18 +02:00
o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2)
2021-10-15 15:50:08 +02:00
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
round-trip if false).
2021-10-29 19:17:18 +02:00
o /CPDFJSONmajorpdfversion (CPDFJSON integer)
o /CPDFJSONminorpdfversion (CPDFJSON integer)
2021-10-15 15:50:08 +02:00
Object 0: The PDF's trailer dictionary
Objects 1..n: The PDF's objects.
o PDF arrays, dictionaries, booleans, and strings are the same in JSON.
o Integers are written as {"I": 0}
o Floats are written as {"F": 0.0}
o Names are written as {"N": "/Pages"}
o Indirect references are integers
o Streams are {"S": [dict, data]}
o Strings are converted into JSON strings in a way which is fully reversible.
2021-10-29 19:17:18 +02:00
2021-10-15 15:50:08 +02:00
There are two subformats: parsing content streams or not. Hello World in CPDF
JSON without parsing content streams:
[
[
-1, { "/CPDFJSONformatversion": { "I": 2 },
"/CPDFJSONcontentparsed": false, "/CPDFJSONstreamdataincluded": true,
"/CPDFJSONmajorpdfversion": { "I": 1 },
"/CPDFJSONminorpdfversion": { "I": 1 } } ], [
0, { "/Size": { "I": 4 }, "/Root": 4,
"/ID": [ "èÎ25\u001e³/°q:Oʇ°u‰", "èÎ25\u001e³/°q:Oʇ°u‰" ] } ], [
1, { "/Type": { "N": "/Pages" }, "/Kids": [ 3 ], "/Count": { "I": 1 } } ],
[
2, {
"S": [
{ "/Length": { "I": 49 } },
"1 0 0 1 50 770 cm BT/F0 36 Tf(Hello, World!)Tj ET"
] } ], [
3, { "/Type": { "N": "/Page" }, "/Parent": 1,
"/Resources": {
"/Font": {
"/F0": {
"/Type": { "N": "/Font" },
"/Subtype": { "N": "/Type1" },
"/BaseFont": { "N": "/Times-Italic" }
}
}
},
"/MediaBox": [
{ "I": 0 }, { "I": 0 }, { "F": 595.2755905510001 }, { "F": 841.88976378 }
], "/Rotate": { "I": 0 }, "/Contents": [ 2 ] } ], [
4, { "/Type": { "N": "/Catalog" }, "/Pages": 1 } ]
]
Alternative object number 2 when parsing of content streams is in operation:
2, {
"S": [
{}, [
[
{ "F": 1.0 }, { "F": 0.0 }, { "F": 0.0 }, { "F": 1.0 }, { "F": 50.0 }, {
"F": 770.0 }, "cm" ], [ "BT" ], [ "/F0", { "F": 36.0 }, "Tf" ], [
"Hello, World!", "Tj" ], [ "ET" ] ]
] } ], [
When parsing content streams:
o Each operation is an array
o The 'operation' for inline images is "InlineImage"
CPDF currently never preserves object streams, and only outputs unencrypted files.
2021-12-30 16:25:24 +01:00
When reloading a JSON file, CPDF knows how to correct or add /Length entries in
2021-10-15 15:50:08 +02:00
streams, so you need not worry about them. *)
2020-03-04 19:50:32 +01:00
open Pdfutil
2021-10-04 14:34:13 +02:00
open Cpdferror
2020-03-04 19:50:32 +01:00
2021-10-06 15:06:14 +02:00
module J = Cpdfyojson.Safe
module P = Pdf
module O = Pdfops
2020-01-31 15:46:33 +01:00
2021-10-01 17:59:05 +02:00
(* Read a numeric operand written as {"F": n}. The payload may be a JSON float
   or a JSON integer (which is widened to float). Any other shape raises via
   [error]. *)
let opf json =
  match json with
  | `Assoc [("F", number)] ->
      begin match number with
      | `Float f -> f
      | `Int i -> float_of_int i
      | _ -> error "num: not a float"
      end
  | _ -> error "num: not a float"
2021-10-01 17:59:05 +02:00
(* Read an integer operand written as {"I": n}. A JSON float payload is
   accepted and converted with [int_of_float]. Any other shape raises via
   [error]. *)
let opi json =
  match json with
  | `Assoc [("I", number)] ->
      begin match number with
      | `Int i -> i
      | `Float f -> int_of_float f
      | _ -> error "num: not an integer"
      end
  | _ -> error "num: not an integer"
2021-10-01 17:59:05 +02:00
2021-10-03 19:11:29 +02:00
(* Convert one parsed content-stream operation from JSON back to a [Pdfops]
   operator. Each operation is a JSON array whose last element is the operator
   name and whose preceding elements are its operands. A list which matches no
   known operator/operand shape is reported on stderr (printed in reversed
   order) and raises via [error]; so does a non-list value. *)
let rec op_of_json json =
  match json with
  | `List items ->
      let reversed = rev items in
      (* Shared failure path for any list we cannot interpret. *)
      let unreadable () =
        Printf.eprintf "Unable to read reversed op from %s\n" (J.show (`List reversed));
        error "op reading failed"
      in
      let matrix a b c d e f =
        {Pdftransform.a = opf a; Pdftransform.b = opf b; Pdftransform.c = opf c;
         Pdftransform.d = opf d; Pdftransform.e = opf e; Pdftransform.f = opf f}
      in
      begin match reversed with
      | `String operator :: rev_operands ->
          begin match operator, rev rev_operands with
          (* Operators with no operands *)
          | "S", [] -> O.Op_S
          | "s", [] -> O.Op_s
          | "f", [] -> O.Op_f
          | "F", [] -> O.Op_F
          | "f*", [] -> O.Op_f'
          | "B", [] -> O.Op_B
          | "B*", [] -> O.Op_B'
          | "b", [] -> O.Op_b
          | "b*", [] -> O.Op_b'
          | "n", [] -> O.Op_n
          | "W", [] -> O.Op_W
          | "W*", [] -> O.Op_W'
          | "BT", [] -> O.Op_BT
          | "ET", [] -> O.Op_ET
          | "q", [] -> O.Op_q
          | "Q", [] -> O.Op_Q
          | "h", [] -> O.Op_h
          | "T*", [] -> O.Op_T'
          | "EMC", [] -> O.Op_EMC
          | "BX", [] -> O.Op_BX
          | "EX", [] -> O.Op_EX
          (* Operators with a fixed operand shape *)
          | "re", [a; b; c; d] -> O.Op_re (opf a, opf b, opf c, opf d)
          | "k", [a; b; c; d] -> O.Op_k (opf a, opf b, opf c, opf d)
          | "m", [a; b] -> O.Op_m (opf a, opf b)
          | "l", [a; b] -> O.Op_l (opf a, opf b)
          | "BDC", [`String s; obj] -> O.Op_BDC (s, object_of_json obj)
          | "gs", [`String s] -> O.Op_gs s
          | "Do", [`String s] -> O.Op_Do s
          | "CS", [`String s] -> O.Op_CS s
          | "j", [i] -> O.Op_j (opi i)
          | "cm", [a; b; c; d; e; f] -> O.Op_cm (matrix a b c d e f)
          | "d", [`List fls; y] -> O.Op_d (map opf fls, opf y)
          | "w", [a] -> O.Op_w (opf a)
          | "J", [a] -> O.Op_J (opi a)
          | "M", [a] -> O.Op_M (opf a)
          | "ri", [`String s] -> O.Op_ri s
          | "i", [a] -> O.Op_i (opi a)
          | "c", [a; b; c; d; e; f] -> O.Op_c (opf a, opf b, opf c, opf d, opf e, opf f)
          | "v", [a; b; c; d] -> O.Op_v (opf a, opf b, opf c, opf d)
          | "y", [a; b; c; d] -> O.Op_y (opf a, opf b, opf c, opf d)
          | "Tc", [a] -> O.Op_Tc (opf a)
          | "Tw", [a] -> O.Op_Tw (opf a)
          | "Tz", [a] -> O.Op_Tz (opf a)
          | "TL", [a] -> O.Op_TL (opf a)
          | "Tf", [`String k; n] -> O.Op_Tf (k, opf n)
          | "Tr", [a] -> O.Op_Tr (opi a)
          | "Ts", [a] -> O.Op_Ts (opf a)
          | "Td", [a; b] -> O.Op_Td (opf a, opf b)
          | "TD", [a; b] -> O.Op_TD (opf a, opf b)
          | "Tm", [a; b; c; d; e; f] -> O.Op_Tm (matrix a b c d e f)
          | "Tj", [`String s] -> O.Op_Tj s
          | "TJ", [obj] -> O.Op_TJ (object_of_json obj)
          | "'", [`String s] -> O.Op_' s
          | "''", [a; b; `String s] -> O.Op_'' (opf a, opf b, s)
          | "d0", [a; b] -> O.Op_d0 (opf a, opf b)
          | "d1", [a; b; c; d; e; f] -> O.Op_d1 (opf a, opf b, opf c, opf d, opf e, opf f)
          | "cs", [`String s] -> O.Op_cs s
          | "G", [a] -> O.Op_G (opf a)
          | "g", [a] -> O.Op_g (opf a)
          | "RG", [a; b; c] -> O.Op_RG (opf a, opf b, opf c)
          | "rg", [a; b; c] -> O.Op_rg (opf a, opf b, opf c)
          | "K", [a; b; c; d] -> O.Op_K (opf a, opf b, opf c, opf d)
          | "sh", [`String s] -> O.Op_sh s
          | "MP", [`String s] -> O.Op_MP s
          | "BMC", [`String s] -> O.Op_BMC s
          | "Unknown", [`String s] -> O.Op_Unknown s
          | "DP", [`String s; obj] -> O.Op_DP (s, object_of_json obj)
          | "InlineImage", [a; `String b] ->
              O.InlineImage (object_of_json a, Pdfio.bytes_of_string b)
          (* Colour operators taking any number of numeric operands *)
          | "SCN", operands -> O.Op_SCN (map opf operands)
          | "SC", operands -> O.Op_SC (map opf operands)
          | "sc", operands -> O.Op_sc (map opf operands)
          | "scn", operands -> O.Op_scn (map opf operands)
          (* Named variants: numeric operands, then a name, then the operator *)
          | "SCNName", _ ->
              begin match rev_operands with
              | `String s :: ns -> O.Op_SCNName (s, map opf (rev ns))
              | _ -> unreadable ()
              end
          | "scnName", _ ->
              begin match rev_operands with
              | `String s :: ns -> O.Op_scnName (s, map opf (rev ns))
              | _ -> unreadable ()
              end
          | _ -> unreadable ()
          end
      | _ -> unreadable ()
      end
  | other ->
      Printf.eprintf "Unable to read op from %s\n" (J.show other);
      error "op reading failed"
2021-10-01 17:59:05 +02:00
2021-10-03 19:11:29 +02:00
(* Convert a CPDF JSON value to a PDF object:
     o null, booleans, strings and arrays map directly;
     o a bare JSON integer is an indirect reference;
     o {"I": n}, {"F": x}, {"N": "/Name"} are integers, reals and names;
     o {"S": [dict, "data"]} is a stream with raw data;
     o {"S": [dict, [ops...]]} is a stream whose content was parsed into
       operations, which are re-serialised here;
     o any other JSON object is a dictionary.
   In both stream forms the /Length entry is set from the actual data.
   Raises via [error] for JSON values with no PDF counterpart. *)
and object_of_json = function
  | `Null -> P.Null
  | `Bool b -> P.Boolean b
  | `Int n -> Pdf.Indirect n
  | `String s -> P.String s
  | `List objs -> P.Array (map object_of_json objs)
  | `Assoc ["I", `Int i] -> P.Integer i
  | `Assoc ["F", `Float f] -> P.Real f
  (* {"F": 1} is parsed by the JSON library as an integer payload. Accept it as
     a real, as [opf] does, rather than letting it fall through to the
     dictionary case below. *)
  | `Assoc ["F", `Int i] -> P.Real (float_of_int i)
  | `Assoc ["N", `String n] -> P.Name n
  | `Assoc ["S", `List [dict; `String data]] ->
      let d' =
        P.add_dict_entry (object_of_json dict) "/Length" (P.Integer (String.length data))
      in
        P.Stream (ref (d', P.Got (Pdfio.bytes_of_string data)))
  | `Assoc ["S", `List [dict; `List parsed_ops]] ->
      begin match
        Pdfops.stream_of_ops (List.map op_of_json parsed_ops)
      with
      | P.Stream {contents = (_, Pdf.Got data)} ->
          let d' =
            P.add_dict_entry (object_of_json dict) "/Length" (P.Integer (Pdfio.bytes_size data))
          in
            P.Stream (ref (d', Pdf.Got data))
      | _ -> assert false
      end
  | `Assoc elts -> P.Dictionary (map (fun (n, o) -> (n, object_of_json o)) elts)
  | _ -> error "not recognised in object_of_json"
2021-10-06 14:52:05 +02:00
(* Build a PDF document from CPDF JSON. The top level must be a list of
   [objnum, object] pairs. Object -1 holds the CPDF parameters, object 0 the
   trailer dictionary; other negative numbers are ignored; positive numbers
   are ordinary PDF objects.
   Raises via [error] if the top level or an entry is malformed, if the
   parameters say stream data was not included, if the major/minor version is
   missing, or if the trailer dictionary has no indirect /Root. *)
let pdf_of_json json =
  let objs = match json with `List objs -> objs | _ -> error "bad json top level" in
  let params = ref Pdf.Null in
  let trailerdict = ref Pdf.Null in
  let objects =
    option_map
      (function
       | `List [`Int objnum; o] ->
           begin match objnum with
           | -1 -> params := object_of_json o; None
           | 0 -> trailerdict := object_of_json o; None
           | n when n < 0 -> None
           | n -> Some (n, object_of_json o)
           end
       | _ -> error "json bad obj")
      objs
  in
  begin match Pdf.lookup_direct (Pdf.empty ()) "/CPDFJSONstreamdataincluded" !params with
  | Some (Pdf.Boolean false) -> error "no stream data; cannot reconstruct PDF"
  | _ -> ()
  end;
  let major =
    match Pdf.lookup_direct (Pdf.empty ()) "/CPDFJSONmajorpdfversion" !params with
      Some (Pdf.Integer i) -> i | _ -> error "bad major version"
  in
  let minor =
    match Pdf.lookup_direct (Pdf.empty ()) "/CPDFJSONminorpdfversion" !params with
      Some (Pdf.Integer i) -> i | _ -> error "bad minor version"
  in
  let root =
    match !trailerdict with
    | Pdf.Dictionary d ->
        begin match lookup "/Root" d with
          Some (Pdf.Indirect i) -> i | _ -> error "bad root"
        end
    | _ -> error "bad root 2"
  in
  let objmap = P.pdfobjmap_empty () in
  (* [replace] rather than [add]: if the JSON repeats an object number the
     last definition wins and no shadowed duplicate is left in the table. *)
  List.iter (fun (k, v) -> Hashtbl.replace objmap k (ref (P.Parsed v), 0)) objects;
  (* Record the real highest object number instead of 0, so the field agrees
     with the objects just loaded. NOTE(review): presumably Pdf uses this to
     number newly added objects - confirm against camlpdf. *)
  let maxobjnum = List.fold_left (fun m (n, _) -> max m n) 0 objects in
  let objects =
    {P.maxobjnum = maxobjnum;
     P.parse = None;
     P.pdfobjects = objmap;
     P.object_stream_ids = Hashtbl.create 0}
  in
    {P.major;
     P.minor;
     P.root;
     P.objects;
     P.trailerdict = !trailerdict;
     P.was_linearized = false;
     P.saved_encryption = None}
(* Constructors for the tagged JSON forms of PDF scalars:
   {"F": x} for reals, {"I": n} for integers and {"N": "/Name"} for names. *)
let mkfloat x = `Assoc ["F", `Float x]

let mkint n = `Assoc ["I", `Int n]

let mkname name = `Assoc ["N", `String name]
2021-10-06 14:52:05 +02:00
2021-12-30 16:25:24 +01:00
(* Convert a PDF object to CPDF JSON.
     clean_strings   if true, every PDF string (at any depth, but never stream
                     data) is passed through [Pdftext.simplify_utf16be].
     pdf             the document, used to follow indirect references.
     fcs             callback invoked with the object number of each stream
                     identified as a content stream: those reached through a
                     /Contents entry, and indirectly referenced streams with
                     /Subtype /Form or /Type /Pattern.
     no_stream_data  if true, stream data is replaced by a placeholder string.
     pcs             if true, streams having a /FunctionType entry are decoded
                     and their /Filter entry dropped.
   Raises via [error] if a stream's dictionary is not a dictionary or its data
   cannot be obtained. *)
let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function
  | P.Null -> `Null
  | P.Boolean b -> `Bool b
  | P.Integer i -> mkint i
  | P.Real r -> mkfloat r
  | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s)
  | P.Name n -> mkname n
  | P.Array objs ->
      (* Pass [clean_strings] down explicitly: it is optional, so omitting it
         would silently reset it to false for nested values. *)
      `List (map (json_of_object ~clean_strings pdf fcs no_stream_data pcs) objs)
  | P.Dictionary elts ->
      (* Report any content streams named by a /Contents entry. *)
      iter
        (function
         | ("/Contents", P.Indirect i) ->
             begin match Pdf.lookup_obj pdf i with
             | Pdf.Array is -> iter (function Pdf.Indirect i -> fcs i | _ -> ()) is
             | _ -> fcs i
             end
         | ("/Contents", P.Array elts) -> iter (function P.Indirect i -> fcs i | _ -> ()) elts
         | _ -> ())
        elts;
      `Assoc (map (fun (k, v) -> (k, json_of_object ~clean_strings pdf fcs no_stream_data pcs v)) elts)
  | P.Stream ({contents = (P.Dictionary _ as d, _)} as mut) as thestream ->
      P.getstream thestream;
      let str, dict' =
        match P.lookup_direct pdf "/FunctionType" d, pcs with
        | Some _, true ->
            Pdfcodec.decode_pdfstream_until_unknown pdf thestream;
            let dict = P.remove_dict_entry d "/Filter" in
              begin match !mut with
              | (_, P.Got b) -> (Pdfio.string_of_bytes b, dict)
              | _ -> error "/FunctionType: failure: decomp"
              end
        | _ ->
            if no_stream_data then ("<<stream data elided>>", d) else
              begin match !mut with
              | (_, P.Got b) -> (Pdfio.string_of_bytes b, d)
              | _ -> error "failure: toget"
              end
      in
        (* Emit {"S": [dict, data]} directly so that the stream data itself is
           never subjected to [clean_strings]; only the dictionary is. *)
        `Assoc ["S", `List [json_of_object ~clean_strings pdf fcs no_stream_data pcs dict'; `String str]]
  | P.Stream _ -> error "error: stream with not-a-dictionary"
  | P.Indirect i ->
      (* Form XObjects and patterns also carry content which can be parsed. *)
      begin match P.lookup_obj pdf i with
      | P.Stream {contents = (P.Dictionary _ as d, _)} ->
          begin match P.lookup_direct pdf "/Subtype" d with
          | Some (P.Name "/Form") -> fcs i
          | _ ->
              begin match P.lookup_direct pdf "/Type" d with
              | Some (P.Name "/Pattern") -> fcs i
              | _ -> ()
              end
          end
      | _ -> ()
      end;
      `Int i
2020-02-04 17:50:04 +01:00
(* Convert one content-stream operator to its JSON form: an array holding the
   operands followed by the operator name as a string. PDF objects appearing
   as operands (BDC/DP property lists, TJ arrays, inline image dictionaries)
   are converted with [json_of_object] using a no-op content-stream callback. *)
let json_of_op pdf no_stream_data op =
  let str s = `String s in
  let floats fs = map mkfloat fs in
  (* An operation is its operands followed by the operator name. *)
  let emit operands name = `List (operands @ [`String name]) in
  let obj o = json_of_object pdf (fun _ -> ()) no_stream_data false o in
  let matrix t =
    floats
      [t.Pdftransform.a; t.Pdftransform.b; t.Pdftransform.c;
       t.Pdftransform.d; t.Pdftransform.e; t.Pdftransform.f]
  in
  match op with
  (* No operands *)
  | O.Op_S -> emit [] "S"
  | O.Op_s -> emit [] "s"
  | O.Op_f -> emit [] "f"
  | O.Op_F -> emit [] "F"
  | O.Op_f' -> emit [] "f*"
  | O.Op_B -> emit [] "B"
  | O.Op_B' -> emit [] "B*"
  | O.Op_b -> emit [] "b"
  | O.Op_b' -> emit [] "b*"
  | O.Op_n -> emit [] "n"
  | O.Op_W -> emit [] "W"
  | O.Op_W' -> emit [] "W*"
  | O.Op_BT -> emit [] "BT"
  | O.Op_ET -> emit [] "ET"
  | O.Op_q -> emit [] "q"
  | O.Op_Q -> emit [] "Q"
  | O.Op_h -> emit [] "h"
  | O.Op_T' -> emit [] "T*"
  | O.Op_EMC -> emit [] "EMC"
  | O.Op_BX -> emit [] "BX"
  | O.Op_EX -> emit [] "EX"
  (* Path construction and graphics state *)
  | O.Op_re (a, b, c, d) -> emit (floats [a; b; c; d]) "re"
  | O.Op_k (c, m, y, k) -> emit (floats [c; m; y; k]) "k"
  | O.Op_m (a, b) -> emit (floats [a; b]) "m"
  | O.Op_l (a, b) -> emit (floats [a; b]) "l"
  | O.Op_BDC (s, o) -> emit [str s; obj o] "BDC"
  | O.Op_gs s -> emit [str s] "gs"
  | O.Op_Do s -> emit [str s] "Do"
  | O.Op_CS s -> emit [str s] "CS"
  | O.Op_SCN fs -> emit (floats fs) "SCN"
  | O.Op_j j -> emit [mkint j] "j"
  | O.Op_cm t -> emit (matrix t) "cm"
  | O.Op_d (fl, y) -> emit [`List (floats fl); mkfloat y] "d"
  | O.Op_w w -> emit [mkfloat w] "w"
  | O.Op_J j -> emit [mkint j] "J"
  | O.Op_M m -> emit [mkfloat m] "M"
  | O.Op_ri s -> emit [str s] "ri"
  | O.Op_i i -> emit [mkint i] "i"
  | O.Op_c (a, b, c, d, e, f) -> emit (floats [a; b; c; d; e; f]) "c"
  | O.Op_v (a, b, c, d) -> emit (floats [a; b; c; d]) "v"
  | O.Op_y (a, b, c, d) -> emit (floats [a; b; c; d]) "y"
  (* Text state and text showing *)
  | O.Op_Tc c -> emit [mkfloat c] "Tc"
  | O.Op_Tw w -> emit [mkfloat w] "Tw"
  | O.Op_Tz z -> emit [mkfloat z] "Tz"
  | O.Op_TL l -> emit [mkfloat l] "TL"
  | O.Op_Tf (k, s) -> emit [str k; mkfloat s] "Tf"
  | O.Op_Tr i -> emit [mkint i] "Tr"
  | O.Op_Ts k -> emit [mkfloat k] "Ts"
  | O.Op_Td (k, k') -> emit (floats [k; k']) "Td"
  | O.Op_TD (k, k') -> emit (floats [k; k']) "TD"
  | O.Op_Tm t -> emit (matrix t) "Tm"
  | O.Op_Tj s -> emit [str s] "Tj"
  | O.Op_TJ pdfobject -> emit [obj pdfobject] "TJ"
  | O.Op_' s -> emit [str s] "'"
  | O.Op_'' (k, k', s) -> emit [mkfloat k; mkfloat k'; str s] "''"
  | O.Op_d0 (k, k') -> emit (floats [k; k']) "d0"
  | O.Op_d1 (a, b, c, d, e, f) -> emit (floats [a; b; c; d; e; f]) "d1"
  (* Colour *)
  | O.Op_cs s -> emit [str s] "cs"
  | O.Op_SC fs -> emit (floats fs) "SC"
  | O.Op_sc fs -> emit (floats fs) "sc"
  | O.Op_scn fs -> emit (floats fs) "scn"
  | O.Op_G k -> emit [mkfloat k] "G"
  | O.Op_g k -> emit [mkfloat k] "g"
  | O.Op_RG (r, g, b) -> emit (floats [r; g; b]) "RG"
  | O.Op_rg (r, g, b) -> emit (floats [r; g; b]) "rg"
  | O.Op_K (c, m, y, k) -> emit (floats [c; m; y; k]) "K"
  (* Shading, marked content and everything else *)
  | O.Op_sh s -> emit [str s] "sh"
  | O.Op_MP s -> emit [str s] "MP"
  | O.Op_BMC s -> emit [str s] "BMC"
  | O.Op_Unknown s -> emit [str s] "Unknown"
  | O.Op_SCNName (s, fs) -> emit (floats fs @ [str s]) "SCNName"
  | O.Op_scnName (s, fs) -> emit (floats fs @ [str s]) "scnName"
  | O.InlineImage (dict, data) ->
      emit [obj dict; str (Pdfio.string_of_bytes data)] "InlineImage"
  | O.Op_DP (s, o) -> emit [str s; obj o] "DP"
2021-10-06 14:52:05 +02:00
2020-01-31 13:17:55 +01:00
(* Parse the content stream [bs] into operators and return them as a JSON list
   of operations. [pdf] and [resources] are handed to the operator parser,
   which needs them when lexing inline images (to look up the colourspace).
   Stream data inside operands is always included. *)
let parse_content_stream pdf resources bs =
  let op_to_json = json_of_op pdf false in
  `List (map op_to_json (O.parse_stream pdf resources [bs]))
2020-01-30 14:10:03 +01:00
2021-10-14 18:42:00 +02:00
(* Give every page at most one content stream. A page whose content is split
   over several streams need not be split on operator boundaries, in which
   case the individual streams would not parse on their own; so such pages
   have their streams parsed together and re-emitted as a single stream. *)
let precombine_page_content pdf =
  let merge_content page =
    match page.Pdfpage.content with
    | [] | [_] -> page
    | streams ->
        let ops = Pdfops.parse_operators pdf page.Pdfpage.resources streams in
        {page with Pdfpage.content = [Pdfops.stream_of_ops ops]}
  in
  Pdfpage.change_pages true pdf (map merge_content (Pdfpage.pages_of_pagetree pdf))
2021-12-30 16:25:24 +01:00
(* Convert any strings in UTF16BE which could actually be in PDFDocEncoding
   (due to having no high bytes) to make editing JSON easier. Walks into
   arrays, dictionaries and stream dictionaries; stream data and all other
   objects are returned unchanged. *)
let rec ppstring_single_object pdf obj =
  let recurse = ppstring_single_object pdf in
  match obj with
  | Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s)
  | Pdf.Array a -> Pdf.recurse_array recurse a
  | Pdf.Dictionary d -> Pdf.recurse_dict recurse d
  | Pdf.Stream {contents = (Pdf.Dictionary dict, data)} ->
      Pdf.Stream (ref (Pdf.recurse_dict recurse dict, data))
  | other -> other
2021-12-30 16:25:24 +01:00
(* Simplify the strings in every object of the document, in place. The trailer
   dictionary is deliberately left alone, since rewriting it may mess up /ID
   if that happens to begin with a UTF16BE BOM. *)
let preprocess_strings pdf =
  let simplify = ppstring_single_object pdf in
  Pdf.objselfmap simplify pdf
2021-10-29 19:17:18 +02:00
2021-10-12 16:35:08 +02:00
(* Convert a whole PDF document to CPDF JSON: a list of [objnum, object] pairs
   beginning with object -1 (the CPDF parameters) and object 0 (the trailer
   dictionary), followed by the document's objects in ascending order.
     parse_content       combine each page's content into one stream and emit
                         content streams as lists of parsed operations.
     no_stream_data      replace stream data with a placeholder (the result
                         cannot then be read back).
     decompress_streams  decode every stream as far as possible first.
     clean_strings       simplify UTF16BE strings in all objects first.
   Raises via [error] if an object recorded as a content stream did not
   serialise as a stream, or is not decoded in the document. *)
let json_of_pdf
  ~parse_content ~no_stream_data ~decompress_streams ~clean_strings
  pdf
=
  if clean_strings then preprocess_strings pdf;
  let pdf = if parse_content then precombine_page_content pdf else pdf in
  if decompress_streams then
    Pdf.objiter
      (fun _ obj ->
         match obj with
         | Pdf.Stream _ -> Pdfcodec.decode_pdfstream_until_unknown pdf obj
         | _ -> ())
      pdf;
  Pdf.remove_unreferenced pdf;
  let trailerdict =
    (0, json_of_object pdf (fun _ -> ()) no_stream_data false pdf.P.trailerdict)
  in
  let parameters =
    (-1, json_of_object pdf (fun _ -> ()) false false
      (Pdf.Dictionary [("/CPDFJSONformatversion", Pdf.Integer 2);
                       ("/CPDFJSONcontentparsed", Pdf.Boolean parse_content);
                       ("/CPDFJSONstreamdataincluded", Pdf.Boolean (not no_stream_data));
                       ("/CPDFJSONmajorpdfversion", Pdf.Integer pdf.Pdf.major);
                       ("/CPDFJSONminorpdfversion", Pdf.Integer pdf.Pdf.minor);
                      ]))
  in
  (* Set of object numbers found to be content streams. A hash table keeps the
     membership test below constant-time and absorbs repeated reports of the
     same stream. *)
  let content_streams = Hashtbl.create 256 in
  let fcs n =
    Hashtbl.replace content_streams n ();
    if parse_content then Pdfcodec.decode_pdfstream_until_unknown pdf (P.lookup_obj pdf n)
  in
  let pairs =
    let ps = ref [] in
      P.objiter
        (fun i pdfobj ->
           ps := (i, json_of_object pdf fcs no_stream_data parse_content pdfobj)::!ps)
        pdf;
      parameters::trailerdict::sort compare !ps
  in
  let pairs_parsed =
    if not parse_content then pairs else
      map
        (fun (objnum, obj) ->
           if Hashtbl.mem content_streams objnum then
             begin match obj with
             | `Assoc ["S", `List [dict; `String _]] ->
                 (* Re-read the (now decoded) data from the document rather
                    than from the JSON string built above. *)
                 let streamdata =
                   match P.lookup_obj pdf objnum with
                   | P.Stream {contents = (_, P.Got b)} -> b
                   | _ -> error "JSON: stream not decoded"
                 in
                 (* /Filter and /Length no longer describe the parsed form. *)
                 let dict =
                   match dict with
                   | `Assoc d ->
                       `Assoc
                         (option_map
                            (function
                             | (("/Filter" | "/Length"), _) -> None
                             | (a, b) -> Some (a, b))
                            d)
                   | _ -> assert false
                 in
                   (objnum,
                    `Assoc ["S", `List [dict; parse_content_stream pdf (P.Dictionary []) streamdata]])
             | _ -> error "json_of_pdf: stream parsing inconsistency"
             end
           else
             (objnum, obj))
        pairs
  in
    `List
      (map
        (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj])
        pairs_parsed)
2020-01-30 11:42:24 +01:00
2021-12-30 16:25:24 +01:00
(* Serialise [pdf] as pretty-printed CPDF JSON to the output [o]. When [o] is
   backed by an OCaml channel the JSON is written to that channel directly;
   otherwise it is rendered to a string and passed to the output's
   [output_string]. The labelled arguments are as for [json_of_pdf];
   [clean_strings] defaults to false. *)
let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf =
  let document =
    json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf
  in
  match o.Pdfio.out_caml_channel with
  | None -> o.Pdfio.output_string (J.pretty_to_string document)
  | Some channel -> J.pretty_to_channel channel document
2021-10-01 13:16:55 +02:00
2021-10-01 13:53:21 +02:00
(* Read a PDF document from CPDF JSON on the input [i]. If [i] is backed by an
   OCaml channel the JSON is parsed from that channel; otherwise the whole
   input is read into a string first. Any exception raised while reading,
   parsing or converting is re-raised via [error] with its printed form as the
   message. *)
let of_input i =
  let read_json () =
    match i.Pdfio.caml_channel with
    | None ->
        let whole = Pdfio.bytes_of_input i 0 i.Pdfio.in_channel_length in
        J.from_string (Pdfio.string_of_bytes whole)
    | Some channel -> J.from_channel channel
  in
  try pdf_of_json (read_json ()) with
  | e -> error (Printexc.to_string e)