Preprocess UTF16BE strings

This commit is contained in:
John Whitington 2021-10-29 18:17:18 +01:00
parent 3142ae5251
commit ee8a10aae4
1 changed files with 31 additions and 3 deletions

View File

@ -6,12 +6,12 @@ object, one for each object in the file and two special ones:
Object -1: CPDF's own data with the PDF version number, CPDF JSON format Object -1: CPDF's own data with the PDF version number, CPDF JSON format
number, and flags used when writing (which may be required when reading): number, and flags used when writing (which may be required when reading):
o /CPDFJSONformatversion (integer, currently 2) o /CPDFJSONformatversion (CPDFJSON integer (see below), currently 2)
o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed) o /CPDFJSONcontentparsed (boolean, true if content streams have been parsed)
o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot o /CPDFJSONstreamdataincluded (boolean, true if stream data included. Cannot
round-trip if false). round-trip if false).
o /CPDFJSONmajorpdfversion (integer) o /CPDFJSONmajorpdfversion (CPDFJSON integer)
o /CPDFJSONminorpdfversion (integer) o /CPDFJSONminorpdfversion (CPDFJSON integer)
Object 0: The PDF's trailer dictionary Object 0: The PDF's trailer dictionary
@ -24,6 +24,11 @@ Objects 1..n: The PDF's objects.
o Indirect references are integers o Indirect references are integers
o Streams are {"S": [dict, data]} o Streams are {"S": [dict, data]}
o Strings are converted from PDFDocEncoding to UTF8 before being encoded in
JSON. When they are read back, the process is reversed: JSON encoded --> UTF8
--> PDFDocEncoding. This conversion makes strings easier to edit. It is not
applied to strings within text operators in parsed content streams.
There are two subformats: parsing content streams or not. Hello World in CPDF There are two subformats: parsing content streams or not. Hello World in CPDF
JSON without parsing content streams: JSON without parsing content streams:
@ -448,10 +453,33 @@ let precombine_page_content pdf =
in in
Pdfpage.change_pages true pdf pages' Pdfpage.change_pages true pdf pages'
(* PDF strings (other than /ID in the trailer dictionary) are encoded either as
   PDFDocEncoding or as UTF16BE. Quite often every character of a UTF16BE
   string could equally be written in PDFDocEncoding, in which case the
   UTF16BE form only adds a lot of \000 bytes which make the JSON hard to edit.
   Strings detected as unicode are therefore round-tripped through UTF8 so
   that such simple ones come back as PDFDocEncoding. Any other string is
   returned unchanged. *)
let preprocess_string s =
  match Pdftext.is_unicode s with
  | true ->
      let utf8 = Pdftext.utf8_of_pdfdocstring s in
      Pdftext.pdfdocstring_of_utf8 utf8
  | false -> s
(* Apply [preprocess_string] to the strings found in one object. Dictionaries,
   arrays and the dictionary part of a stream are traversed with
   [Pdf.recurse_dict] / [Pdf.recurse_array]; stream data and every other kind
   of object are handed back as they are. [pdf] is only threaded through the
   recursion. *)
let rec ppstring_single_object pdf obj =
  let recurse = ppstring_single_object pdf in
  match obj with
  | Pdf.String s -> Pdf.String (preprocess_string s)
  | Pdf.Array elts -> Pdf.recurse_array recurse elts
  | Pdf.Dictionary entries -> Pdf.recurse_dict recurse entries
  | Pdf.Stream {contents = (Pdf.Dictionary entries, data)} ->
      Pdf.Stream {contents = (Pdf.recurse_dict recurse entries, data)}
  | other -> other
(* Preprocess the strings in every numbered object of [pdf], in place.

   The trailer dictionary is deliberately skipped. Its /ID entry holds byte
   strings rather than text, so an /ID which happens to begin with the UTF16BE
   byte-order mark would be taken for a unicode string and rewritten, changing
   the file identifier. *)
let preprocess_strings pdf =
  Pdf.objselfmap (ppstring_single_object pdf) pdf
let json_of_pdf let json_of_pdf
~parse_content ~no_stream_data ~decompress_streams ~parse_content ~no_stream_data ~decompress_streams
pdf pdf
= =
preprocess_strings pdf;
let pdf = if parse_content then precombine_page_content pdf else pdf in let pdf = if parse_content then precombine_page_content pdf else pdf in
if decompress_streams then if decompress_streams then
Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf; Pdf.objiter (fun _ obj -> Pdfcodec.decode_pdfstream_until_unknown pdf obj) pdf;