From 86f5cb05a813c93ffbd2cc599d044e8b4cc277f7 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Fri, 15 Oct 2021 14:50:08 +0100 Subject: [PATCH] more --- cpdfjson.ml | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 5 deletions(-) diff --git a/cpdfjson.ml b/cpdfjson.ml index 93f61a2..141d4e1 100644 --- a/cpdfjson.ml +++ b/cpdfjson.ml @@ -1,5 +1,84 @@ -(* FIXME investigate whether we need to look at inherited resources more *) -(* FIXME document format at top of this file *) +(* Read and write PDF files in JSON format. + +The file is an array of arrays containing an object number followed by an +object, one for each object in the file and two special ones: + +Object -1: CPDF's own data with the PDF version number, CPDF JSON format +number, and flags used when writing (which may be required when reading): + + o /CPDFJSONformatversion (integer, currently 2) + o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed) + o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot + round-trip if false). + o /CPDFJSONmajorpdfversion (integer) + o /CPDFJSONminorpdfversion (integer) + +Object 0: The PDF's trailer dictionary + +Objects 1..n: The PDF's objects. + + o PDF arrays, dictionaries, booleans, and strings are the same in JSON. + o Integers are written as {"I": 0} + o Floats are written as {"F": 0.0} + o Names are written as {"N": "/Pages"} + o Indirect references are integers + o Streams are {"S": [dict, data]} + +There are two subformats: parsing content streams or not. 
Hello World in CPDF +JSON without parsing content streams: + +[ + [ + -1, { "/CPDFJSONformatversion": { "I": 2 }, + "/CPDFJSONcontentparsed": false, "/CPDFJSONstreamdataincluded": true, + "/CPDFJSONmajorpdfversion": { "I": 1 }, + "/CPDFJSONminorpdfversion": { "I": 1 } } ], [ + 0, { "/Size": { "I": 4 }, "/Root": 4, + "/ID": [ "èÎ25\u001e³/°q:Oʇ°u‰", "èÎ25\u001e³/°q:Oʇ°u‰" ] } ], [ + 1, { "/Type": { "N": "/Pages" }, "/Kids": [ 3 ], "/Count": { "I": 1 } } ], + [ + 2, { + "S": [ + { "/Length": { "I": 49 } }, + "1 0 0 1 50 770 cm BT/F0 36 Tf(Hello, World!)Tj ET" + ] } ], [ + 3, { "/Type": { "N": "/Page" }, "/Parent": 1, + "/Resources": { + "/Font": { + "/F0": { + "/Type": { "N": "/Font" }, + "/Subtype": { "N": "/Type1" }, + "/BaseFont": { "N": "/Times-Italic" } + } + } + }, + "/MediaBox": [ + { "I": 0 }, { "I": 0 }, { "F": 595.2755905510001 }, { "F": 841.88976378 } + ], "/Rotate": { "I": 0 }, "/Contents": [ 2 ] } ], [ + 4, { "/Type": { "N": "/Catalog" }, "/Pages": 1 } ] +] + +Alternative object number 2 when parsing of content streams is in operation: + +2, { +"S": [ + {}, [ + [ + { "F": 1.0 }, { "F": 0.0 }, { "F": 0.0 }, { "F": 1.0 }, { "F": 50.0 }, { + "F": 770.0 }, "cm" ], [ "BT" ], [ "/F0", { "F": 36.0 }, "Tf" ], [ + "Hello, World!", "Tj" ], [ "ET" ] ] +] } ], [ + +When parsing content streams: + + o Each operation is an array + o The 'operation' for inline images is "InlineImage" + +CPDF currently never preserves object streams, and only outputs unencrypted files. + +When reloading a JSON file, CPDF knows how to correct /Length entries in +streams, so you need not worry about them. *) + open Pdfutil open Cpdferror @@ -352,9 +431,7 @@ let parse_content_stream pdf resources bs = `List (map (json_of_op pdf false) ops) (* Make sure each page only has one page content stream. Otherwise, - if not split on op boundaries, each one would fail to parse on its own.
The - caller should really only do this on otherwise-failing files, since it could - blow up any shared content streams. *) + if not split on op boundaries, each one would fail to parse on its own. *) let precombine_page_content pdf = let pages' = map