mirror of
				https://github.com/johnwhitington/cpdf-source.git
				synced 2025-06-05 22:09:39 +02:00 
			
		
		
		
	Fixing up JSON text output
This commit is contained in:
		| @@ -26,7 +26,7 @@ let annotations_json_page pdf page pagenum = | |||||||
|   | Some (Pdf.Array annots) -> |   | Some (Pdf.Array annots) -> | ||||||
|       map |       map | ||||||
|         (fun annot -> |         (fun annot -> | ||||||
|            `List [`Int pagenum; Cpdfjson.json_of_object pdf (fun _ -> ()) false false annot]) |            `List [`Int pagenum; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot]) | ||||||
|         (map (Pdf.direct pdf) annots) |         (map (Pdf.direct pdf) annots) | ||||||
|   | _ -> [] |   | _ -> [] | ||||||
|  |  | ||||||
|   | |||||||
| @@ -184,7 +184,7 @@ let output_json_marks ch calculate_page_number pdf fastrefnums marks = | |||||||
|   let json_of_mark m = |   let json_of_mark m = | ||||||
|     `Assoc |     `Assoc | ||||||
|        [("level", `Int m.Pdfmarks.level); |        [("level", `Int m.Pdfmarks.level); | ||||||
|         ("text", `String (Pdftext.utf8_of_pdfdocstring m.Pdfmarks.text)); |         ("text", `String (Pdftext.utf8_of_pdfdocstring (Pdftext.simplify_utf16be m.Pdfmarks.text))); | ||||||
|         ("page", `Int (calculate_page_number m)); |         ("page", `Int (calculate_page_number m)); | ||||||
|         ("open", `Bool m.Pdfmarks.isopen); |         ("open", `Bool m.Pdfmarks.isopen); | ||||||
|         ("target", json_of_target pdf fastrefnums m.Pdfmarks.target)] |         ("target", json_of_target pdf fastrefnums m.Pdfmarks.target)] | ||||||
|   | |||||||
| @@ -462,6 +462,7 @@ type args = | |||||||
|    mutable jsonparsecontentstreams : bool; |    mutable jsonparsecontentstreams : bool; | ||||||
|    mutable jsonnostreamdata : bool; |    mutable jsonnostreamdata : bool; | ||||||
|    mutable jsondecompressstreams : bool; |    mutable jsondecompressstreams : bool; | ||||||
|  |    mutable jsoncleanstrings : bool; | ||||||
|    mutable ocgrenamefrom : string; |    mutable ocgrenamefrom : string; | ||||||
|    mutable ocgrenameto : string; |    mutable ocgrenameto : string; | ||||||
|    mutable dedup : bool; |    mutable dedup : bool; | ||||||
| @@ -581,6 +582,7 @@ let args = | |||||||
|    jsonparsecontentstreams = false; |    jsonparsecontentstreams = false; | ||||||
|    jsonnostreamdata = false; |    jsonnostreamdata = false; | ||||||
|    jsondecompressstreams = false; |    jsondecompressstreams = false; | ||||||
|  |    jsoncleanstrings = false; | ||||||
|    ocgrenamefrom = ""; |    ocgrenamefrom = ""; | ||||||
|    ocgrenameto = ""; |    ocgrenameto = ""; | ||||||
|    dedup = false; |    dedup = false; | ||||||
| @@ -685,6 +687,7 @@ let reset_arguments () = | |||||||
|   args.jsonparsecontentstreams <- false; |   args.jsonparsecontentstreams <- false; | ||||||
|   args.jsonnostreamdata <- false; |   args.jsonnostreamdata <- false; | ||||||
|   args.jsondecompressstreams <- false; |   args.jsondecompressstreams <- false; | ||||||
|  |   args.jsoncleanstrings <- false; | ||||||
|   args.ocgrenamefrom <- ""; |   args.ocgrenamefrom <- ""; | ||||||
|   args.ocgrenameto <- ""; |   args.ocgrenameto <- ""; | ||||||
|   args.dedup <- false; |   args.dedup <- false; | ||||||
| @@ -1614,6 +1617,9 @@ let setjsonnostreamdata () = | |||||||
| let setjsondecompressstreams () = | let setjsondecompressstreams () = | ||||||
|   args.jsondecompressstreams <- true |   args.jsondecompressstreams <- true | ||||||
|  |  | ||||||
|  | let setjsoncleanstrings () = | ||||||
|  |   args.jsoncleanstrings <- true | ||||||
|  |  | ||||||
| let setocgrenamefrom s = | let setocgrenamefrom s = | ||||||
|   args.ocgrenamefrom <- s |   args.ocgrenamefrom <- s | ||||||
|  |  | ||||||
| @@ -2426,6 +2432,9 @@ and specs = | |||||||
|    ("-output-json-decompress-streams", |    ("-output-json-decompress-streams", | ||||||
|      Arg.Unit setjsondecompressstreams, |      Arg.Unit setjsondecompressstreams, | ||||||
|      " Skip stream data for brevity"); |      " Skip stream data for brevity"); | ||||||
|  |    ("-output-json-clean-strings", | ||||||
|  |      Arg.Unit setjsoncleanstrings, | ||||||
|  |      " Convert UTF16BE strings to PDFDocEncoding when possible"); | ||||||
|    ("-j", |    ("-j", | ||||||
|      Arg.String set_json_input, |      Arg.String set_json_input, | ||||||
|      " Load a PDF JSON file"); |      " Load a PDF JSON file"); | ||||||
| @@ -2959,6 +2968,7 @@ let write_json output pdf = | |||||||
|         ~parse_content:args.jsonparsecontentstreams |         ~parse_content:args.jsonparsecontentstreams | ||||||
|         ~no_stream_data:args.jsonnostreamdata |         ~no_stream_data:args.jsonnostreamdata | ||||||
|         ~decompress_streams:args.jsondecompressstreams |         ~decompress_streams:args.jsondecompressstreams | ||||||
|  |         ~clean_strings:args.jsoncleanstrings | ||||||
|         pdf |         pdf | ||||||
|   | File filename -> |   | File filename -> | ||||||
|       let f = open_out filename in |       let f = open_out filename in | ||||||
| @@ -2967,6 +2977,7 @@ let write_json output pdf = | |||||||
|           ~parse_content:args.jsonparsecontentstreams |           ~parse_content:args.jsonparsecontentstreams | ||||||
|           ~no_stream_data:args.jsonnostreamdata |           ~no_stream_data:args.jsonnostreamdata | ||||||
|           ~decompress_streams:args.jsondecompressstreams |           ~decompress_streams:args.jsondecompressstreams | ||||||
|  |           ~clean_strings:args.jsoncleanstrings | ||||||
|           pdf; |           pdf; | ||||||
|         close_out f |         close_out f | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										34
									
								
								cpdfjson.ml
									
									
									
									
									
								
							
							
						
						
									
										34
									
								
								cpdfjson.ml
									
									
									
									
									
								
							| @@ -23,11 +23,12 @@ Objects 1..n: The PDF's objects. | |||||||
|   o Names are written as {"N": "/Pages"} |   o Names are written as {"N": "/Pages"} | ||||||
|   o Indirect references are integers |   o Indirect references are integers | ||||||
|   o Streams are {"S": [dict, data]} |   o Streams are {"S": [dict, data]} | ||||||
|  |   o Strings are converted from UTF16BE/PDFDocEncoding to UTF8 before being | ||||||
|   o Strings are converted from PDFDocEncoding to UTF8 before being encoded in |   encoded in JSON. When they are read back the process is JSON encoded --> UTF8 | ||||||
|   JSON. When they are read back the process is JSON encoded --> UTF8 --> |   --> UTF16BE/PDFDocEncoding. This process is fully reversible: it is to allow | ||||||
|   PDFDocEncoding. This process is to allow easier editing of strings. This |   easier editing of strings. This does not happen to strings within text | ||||||
|   does not happen to strings within text operators in parsed content streams.  |   operators in parsed content streams, nor to /ID values in the | ||||||
|  |   trailer dictionary, since neither is UTF16BE/PDFDocEncoding to begin with.  | ||||||
|  |  | ||||||
| There are two subformats: parsing content streams or not.  Hello World in CPDF | There are two subformats: parsing content streams or not.  Hello World in CPDF | ||||||
| JSON without parsing content streams: | JSON without parsing content streams: | ||||||
| @@ -81,7 +82,7 @@ When parsing content streams: | |||||||
|  |  | ||||||
| CPDF currently never preserves object streams, and only outputs unencrypted files. | CPDF currently never preserves object streams, and only outputs unencrypted files. | ||||||
|  |  | ||||||
| When reloading a JSON file, CPDF knows how to correct /Length entries in | When reloading a JSON file, CPDF knows how to correct or add /Length entries in | ||||||
| streams, so you need not worry about them.  *) | streams, so you need not worry about them.  *) | ||||||
|  |  | ||||||
| open Pdfutil | open Pdfutil | ||||||
| @@ -277,12 +278,12 @@ let mkfloat f = `Assoc [("F", `Float f)] | |||||||
| let mkint i = `Assoc [("I", `Int i)] | let mkint i = `Assoc [("I", `Int i)] | ||||||
| let mkname n = `Assoc [("N", `String n)] | let mkname n = `Assoc [("N", `String n)] | ||||||
|  |  | ||||||
| let rec json_of_object pdf fcs no_stream_data pcs = function | let rec json_of_object ?(clean_strings=false) pdf fcs no_stream_data pcs = function | ||||||
|   | P.Null -> `Null |   | P.Null -> `Null | ||||||
|   | P.Boolean b -> `Bool b |   | P.Boolean b -> `Bool b | ||||||
|   | P.Integer i -> mkint i |   | P.Integer i -> mkint i | ||||||
|   | P.Real r -> mkfloat r |   | P.Real r -> mkfloat r | ||||||
|   | P.String s -> `String s |   | P.String s -> `String (if clean_strings then Pdftext.simplify_utf16be s else s) | ||||||
|   | P.Name n -> mkname n |   | P.Name n -> mkname n | ||||||
|   | P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs) |   | P.Array objs -> `List (map (json_of_object pdf fcs no_stream_data pcs) objs) | ||||||
|   | P.Dictionary elts -> |   | P.Dictionary elts -> | ||||||
| @@ -453,9 +454,8 @@ let precombine_page_content pdf = | |||||||
|   in |   in | ||||||
|     Pdfpage.change_pages true pdf pages' |     Pdfpage.change_pages true pdf pages' | ||||||
|  |  | ||||||
| (* FIXME make this optional? And maybe move into actual JSON reader, instead of | (* Convert any strings in UTF16BE which could actually be in PDFDocEncoding | ||||||
|    preprocessing PDF, so it helps us when writing, say, the output of |    (due to having no high bytes) to make editing JSON easier. *) | ||||||
|    -print-dict-entry? *) |  | ||||||
| let rec ppstring_single_object pdf = function | let rec ppstring_single_object pdf = function | ||||||
|   | Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d |   | Pdf.Dictionary d -> Pdf.recurse_dict (ppstring_single_object pdf) d | ||||||
|   | (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) -> |   | (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) -> | ||||||
| @@ -464,16 +464,16 @@ let rec ppstring_single_object pdf = function | |||||||
|   | Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s) |   | Pdf.String s -> Pdf.String (Pdftext.simplify_utf16be s) | ||||||
|   | x -> x |   | x -> x | ||||||
|  |  | ||||||
|  | (* Do all objects, but skip the trailer dictionary since this may mess up /ID if it | ||||||
|  |    happens to begin with a UTF16BE BOM *) | ||||||
| let preprocess_strings pdf = | let preprocess_strings pdf = | ||||||
|     Pdf.objselfmap (ppstring_single_object pdf) pdf |     Pdf.objselfmap (ppstring_single_object pdf) pdf | ||||||
|     (* Skip the trailer dictionary since may mess up /ID if it happens to begin with UTF16BE BOM *) |  | ||||||
|     (*pdf.Pdf.trailerdict <- ppstring_single_object pdf pdf.Pdf.trailerdict*) |  | ||||||
|  |  | ||||||
| let json_of_pdf | let json_of_pdf | ||||||
|   ~parse_content ~no_stream_data ~decompress_streams |   ~parse_content ~no_stream_data ~decompress_streams ~clean_strings | ||||||
|   pdf |   pdf | ||||||
| = | = | ||||||
|   preprocess_strings pdf; |   if clean_strings then preprocess_strings pdf; | ||||||
|   let pdf = if parse_content then precombine_page_content pdf else pdf in |   let pdf = if parse_content then precombine_page_content pdf else pdf in | ||||||
|   if decompress_streams then |   if decompress_streams then | ||||||
|     Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf; |     Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf; | ||||||
| @@ -532,8 +532,8 @@ let json_of_pdf | |||||||
|           (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) |           (fun (objnum, jsonobj) -> `List [`Int objnum; jsonobj]) | ||||||
|           pairs_parsed) |           pairs_parsed) | ||||||
|  |  | ||||||
| let to_output o ~parse_content ~no_stream_data ~decompress_streams pdf = | let to_output o ~parse_content ~no_stream_data ~decompress_streams ?(clean_strings=false) pdf = | ||||||
|   let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams pdf in |   let json = json_of_pdf ~parse_content ~no_stream_data ~decompress_streams ~clean_strings pdf in | ||||||
|     match o.Pdfio.out_caml_channel with |     match o.Pdfio.out_caml_channel with | ||||||
|     | Some ch -> J.pretty_to_channel ch json |     | Some ch -> J.pretty_to_channel ch json | ||||||
|     | None -> o.Pdfio.output_string (J.pretty_to_string json) |     | None -> o.Pdfio.output_string (J.pretty_to_string json) | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| val json_of_object : Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t | val json_of_object : ?clean_strings:bool -> Pdf.t -> (int -> unit) -> bool -> bool -> Pdf.pdfobject -> Cpdfyojson.Safe.t | ||||||
| val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject | val object_of_json : Cpdfyojson.Safe.t -> Pdf.pdfobject | ||||||
| val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> Pdf.t -> unit | val to_output : Pdfio.output -> parse_content:bool -> no_stream_data:bool -> decompress_streams:bool -> ?clean_strings:bool -> Pdf.t -> unit | ||||||
| val of_input : Pdfio.input -> Pdf.t | val of_input : Pdfio.input -> Pdf.t | ||||||
|   | |||||||
							
								
								
									
										
											BIN
										
									
								
								cpdfmanual.pdf
									
									
									
									
									
								
							
							
						
						
									
										
											BIN
										
									
								
								cpdfmanual.pdf
									
									
									
									
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -1558,6 +1558,9 @@ There are two options which turn off parts of the squeezer. They are \texttt{-sq | |||||||
|   \vspace{1.5mm} |   \vspace{1.5mm} | ||||||
|   \small\noindent\verb!cpdf -add-bookmarks <bookmark file> in.pdf -o out.pdf! |   \small\noindent\verb!cpdf -add-bookmarks <bookmark file> in.pdf -o out.pdf! | ||||||
|  |  | ||||||
|  |   \vspace{1.5mm} | ||||||
|  |   \small\noindent\verb!cpdf -add-bookmarks-json <bookmark file> in.pdf -o out.pdf! | ||||||
|  |  | ||||||
|   \vspace{1.5mm} |   \vspace{1.5mm} | ||||||
|   \small\noindent\verb!cpdf -bookmarks-open-to-level <n> in.pdf -o out.pdf! |   \small\noindent\verb!cpdf -bookmarks-open-to-level <n> in.pdf -o out.pdf! | ||||||
|  |  | ||||||
| @@ -1666,6 +1669,14 @@ will be given if the bookmarks file is not in the correct form (in particular, | |||||||
| the numbers in the first column which specify the level must form a proper | the numbers in the first column which specify the level must form a proper | ||||||
| tree with no entry being more than one greater than the last). | tree with no entry being more than one greater than the last). | ||||||
|  |  | ||||||
|  | Bookmarks in JSON format (see above) may be added with \texttt{-add-bookmarks-json}: | ||||||
|  |  | ||||||
|  |   \begin{framed} | ||||||
|  |    \small\verb!cpdf -add-bookmarks-json bookmarks.json in.pdf -o out.pdf! | ||||||
|  |   \end{framed} | ||||||
|  |  | ||||||
|  | Remember that strings in JSON bookmark files are written in UTF8, rather than as native PDF strings. | ||||||
|  |  | ||||||
| \section{Opening bookmarks} | \section{Opening bookmarks} | ||||||
| \index{bookmarks!opening at level} | \index{bookmarks!opening at level} | ||||||
| As an alternative to extracting a bookmark file and manipulating the open-status of bookmarks, mass manipulation may be achieved by the following operation: | As an alternative to extracting a bookmark file and manipulating the open-status of bookmarks, mass manipulation may be achieved by the following operation: | ||||||
|   | |||||||
| @@ -274,7 +274,7 @@ let print_dict_entry pdf key = | |||||||
|     match Pdf.lookup_direct pdf key d with |     match Pdf.lookup_direct pdf key d with | ||||||
|     | Some v -> |     | Some v -> | ||||||
|         (* We use a double newline as a separator. *) |         (* We use a double newline as a separator. *) | ||||||
|         Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object pdf (fun _ -> ()) false false v)); |         Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false v)); | ||||||
|         d |         d | ||||||
|     | None -> d |     | None -> d | ||||||
|   in |   in | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user