Streamline st/st2

This commit is contained in:
John Whitington 2024-06-20 14:50:06 +01:00
parent d812881619
commit c329895094
1 changed files with 109 additions and 117 deletions

210
cpdfua.ml
View File

@ -6,7 +6,6 @@ open Cpdferror
a) Those which require looking deep inside font files; and a) Those which require looking deep inside font files; and
b) Those which require reading inside the graphics stream. *) b) Those which require reading inside the graphics stream. *)
(* FIXME pass st / st2 around *)
(* FIXME maximise chain usage *) (* FIXME maximise chain usage *)
exception MatterhornError of Cpdfyojson.Safe.t exception MatterhornError of Cpdfyojson.Safe.t
@ -108,9 +107,6 @@ let read_st2 pdf =
let rec st_of_st2 = function E2 (a, _, cs) -> E (a, map st_of_st2 cs) let rec st_of_st2 = function E2 (a, _, cs) -> E (a, map st_of_st2 cs)
let read_st pdf =
st_of_st2 (read_st2 pdf)
let rec st_mem p = function let rec st_mem p = function
| E (s, _) when p s -> true | E (s, _) when p s -> true
| E (_, cs) -> List.exists (st_mem p) cs | E (_, cs) -> List.exists (st_mem p) cs
@ -120,16 +116,16 @@ let string_of_st st =
Cpdfyojson.Safe.pretty_to_string (convert st) Cpdfyojson.Safe.pretty_to_string (convert st)
(* Content marked as Artifact is present inside tagged content. *) (* Content marked as Artifact is present inside tagged content. *)
let matterhorn_01_003 pdf = todo () let matterhorn_01_003 _ _ pdf = todo ()
(* Tagged content is present inside content marked as Artifact. *) (* Tagged content is present inside content marked as Artifact. *)
let matterhorn_01_004 pdf = todo () let matterhorn_01_004 _ _ pdf = todo ()
(* Content is neither marked as Artifact nor tagged as real content. *) (* Content is neither marked as Artifact nor tagged as real content. *)
let matterhorn_01_005 pdf = todo () let matterhorn_01_005 _ _ pdf = todo ()
(* Suspects entry has a value of true. *) (* Suspects entry has a value of true. *)
let matterhorn_01_007 pdf = let matterhorn_01_007 _ _ pdf =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/MarkInfo"; "/Suspects"] with match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/MarkInfo"; "/Suspects"] with
| Some (Pdf.Boolean true) -> merror () | Some (Pdf.Boolean true) -> merror ()
| _ -> () | _ -> ()
@ -161,7 +157,7 @@ let circular rm =
(* One or more non-standard tags mapping does not terminate with a standard (* One or more non-standard tags mapping does not terminate with a standard
type. *) type. *)
let matterhorn_02_001 pdf = let matterhorn_02_001 _ _ pdf =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with
| Some rm -> | Some rm ->
let rolemap = read_rolemap pdf rm in let rolemap = read_rolemap pdf rm in
@ -170,7 +166,7 @@ let matterhorn_02_001 pdf =
| None -> () | None -> ()
(* A circular mapping exists. *) (* A circular mapping exists. *)
let matterhorn_02_003 pdf = let matterhorn_02_003 _ _ pdf =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with
| Some rm -> | Some rm ->
let rolemap = read_rolemap pdf rm in let rolemap = read_rolemap pdf rm in
@ -178,7 +174,7 @@ let matterhorn_02_003 pdf =
| None -> () | None -> ()
(* One or more standard types are remapped. *) (* One or more standard types are remapped. *)
let matterhorn_02_004 pdf = let matterhorn_02_004 _ _ pdf =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with
| Some rm -> | Some rm ->
let rolemap = read_rolemap pdf rm in let rolemap = read_rolemap pdf rm in
@ -186,14 +182,14 @@ let matterhorn_02_004 pdf =
| None -> () | None -> ()
(* Document does not contain an XMP metadata stream *) (* Document does not contain an XMP metadata stream *)
let matterhorn_06_001 pdf = let matterhorn_06_001 _ _ pdf =
match Cpdfmetadata.get_metadata pdf with match Cpdfmetadata.get_metadata pdf with
| Some _ -> () | Some _ -> ()
| None -> merror () | None -> merror ()
(* The XMP metadata stream in the Catalog dictionary does not include the (* The XMP metadata stream in the Catalog dictionary does not include the
PDF/UA identifier. *) PDF/UA identifier. *)
let matterhorn_06_002 pdf = let matterhorn_06_002 _ _ pdf =
match Cpdfmetadata.get_metadata pdf with match Cpdfmetadata.get_metadata pdf with
| Some metadata -> | Some metadata ->
let _, tree = Cpdfmetadata.xmltree_of_bytes metadata in let _, tree = Cpdfmetadata.xmltree_of_bytes metadata in
@ -204,7 +200,7 @@ let matterhorn_06_002 pdf =
| None -> () (* case covered by test 06_001 above, no need for two failures *) | None -> () (* case covered by test 06_001 above, no need for two failures *)
(* XMP metadata stream does not contain dc:title *) (* XMP metadata stream does not contain dc:title *)
let matterhorn_06_003 pdf = let matterhorn_06_003 _ _ pdf =
match Cpdfmetadata.get_metadata pdf with match Cpdfmetadata.get_metadata pdf with
| Some metadata -> | Some metadata ->
let _, tree = Cpdfmetadata.xmltree_of_bytes metadata in let _, tree = Cpdfmetadata.xmltree_of_bytes metadata in
@ -216,7 +212,7 @@ let matterhorn_06_003 pdf =
(* ViewerPreferences dictionary of the Catalog dictionary does not contain a (* ViewerPreferences dictionary of the Catalog dictionary does not contain a
DisplayDocTitle entry. *) DisplayDocTitle entry. *)
let matterhorn_07_001 pdf = let matterhorn_07_001 _ _ pdf =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog -> | Some catalog ->
begin match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with begin match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with
@ -231,7 +227,7 @@ let matterhorn_07_001 pdf =
(* ViewerPreferences dictionary of the Catalog dictionary contains a (* ViewerPreferences dictionary of the Catalog dictionary contains a
DisplayDocTitle entry with a value of false. *) DisplayDocTitle entry with a value of false. *)
let matterhorn_07_002 pdf = let matterhorn_07_002 _ _ pdf =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog -> | Some catalog ->
begin match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with begin match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with
@ -247,7 +243,7 @@ let matterhorn_07_002 pdf =
(* A table-related structure element is used in a way that does not conform to (* A table-related structure element is used in a way that does not conform to
the syntax defined in ISO 32000-1, Table 337. We assume no nesting of whole the syntax defined in ISO 32000-1, Table 337. We assume no nesting of whole
tables, since it is not excplicitly mentioned in the spec. *) tables, since it is not excplicitly mentioned in the spec. *)
let matterhorn_09_004 pdf = let matterhorn_09_004 st st2 pdf =
let rec check_table = function let rec check_table = function
| E ("/Table", cs) -> | E ("/Table", cs) ->
let cs = let cs =
@ -286,11 +282,11 @@ let matterhorn_09_004 pdf =
if List.exists (function E ("/TR", _) -> false | _ -> true) node then if List.exists (function E ("/TR", _) -> false | _ -> true) node then
merror_str "Element in /THead | /TBody | /TFoot not a /TR" merror_str "Element in /THead | /TBody | /TFoot not a /TR"
in in
check_table (read_st pdf) check_table st
(* A list-related structure element is used in a way that does not conform to (* A list-related structure element is used in a way that does not conform to
Table 336 in ISO 32000-1. *) Table 336 in ISO 32000-1. *)
let matterhorn_09_005 pdf = let matterhorn_09_005 st st2 pdf =
let rec check_l = function let rec check_l = function
| E ("/L", cs) -> | E ("/L", cs) ->
(* 0 or 1 captions *) (* 0 or 1 captions *)
@ -312,15 +308,14 @@ let matterhorn_09_005 pdf =
| E (("/LBody"| "/Lbl"), cs) -> iter check_l cs | E (("/LBody"| "/Lbl"), cs) -> iter check_l cs
| E (_, _) -> merror_str "Child of /LI must be /Lbl or /LBody" | E (_, _) -> merror_str "Child of /LI must be /Lbl or /LBody"
in in
check_l (read_st pdf) check_l st
(* A TOC-related structure element is used in a way that does not conform to (* A TOC-related structure element is used in a way that does not conform to
Table 333 in ISO 32000-1. *) Table 333 in ISO 32000-1. *)
(* We test two things: a) everything under a TOC is correct; and b) There is no (* We test two things: a) everything under a TOC is correct; and b) There is no
TOCI except under a TOC. *) TOCI except under a TOC. *)
let matterhorn_09_006 pdf = let matterhorn_09_006 st st2 pdf =
let st = read_st pdf in
let seen_toc = ref false in let seen_toc = ref false in
let rec check_toplevel_TOCI n = let rec check_toplevel_TOCI n =
begin match n with begin match n with
@ -353,8 +348,7 @@ let matterhorn_09_006 pdf =
(* A Ruby-related structure element is used in a way that does not conform to (* A Ruby-related structure element is used in a way that does not conform to
Table 338 in ISO 32000-1. *) Table 338 in ISO 32000-1. *)
let matterhorn_09_007 pdf = let matterhorn_09_007 st st2 pdf =
let st = read_st pdf in
let rec check_ruby = function let rec check_ruby = function
| E ("/Ruby", cs) -> | E ("/Ruby", cs) ->
if List.exists (function (E (("/RB" | "/RT" | "RP"), _)) -> false | _ -> true) cs then merror () if List.exists (function (E (("/RB" | "/RT" | "RP"), _)) -> false | _ -> true) cs then merror ()
@ -365,8 +359,7 @@ let matterhorn_09_007 pdf =
(* A Warichu-related structure element is used in a way that does not conform (* A Warichu-related structure element is used in a way that does not conform
to Table 338 in ISO 32000-1. *) to Table 338 in ISO 32000-1. *)
let matterhorn_09_008 pdf = let matterhorn_09_008 st st2 pdf =
let st = read_st pdf in
let rec check_warichu = function let rec check_warichu = function
| E ("/Ruby", cs) -> | E ("/Ruby", cs) ->
if List.exists (function (E (("/WT" | "/WP"), _)) -> false | _ -> true) cs then merror () if List.exists (function (E (("/WT" | "/WP"), _)) -> false | _ -> true) cs then merror ()
@ -376,32 +369,32 @@ let matterhorn_09_008 pdf =
check_warichu st check_warichu st
(* Character code cannot be mapped to Unicode. *) (* Character code cannot be mapped to Unicode. *)
let matterhorn_10_001 pdf = let matterhorn_10_001 _ _ pdf =
unimpl () unimpl ()
(* Natural language for text in page content cannot be determined. *) (* Natural language for text in page content cannot be determined. *)
let matterhorn_11_001 pdf = let matterhorn_11_001 _ _ pdf =
unimpl () unimpl ()
(* Natural language for text in Alt, ActualText and E attributes cannot be (* Natural language for text in Alt, ActualText and E attributes cannot be
determined. *) determined. *)
let matterhorn_11_002 pdf = todo () let matterhorn_11_002 _ _ pdf = todo ()
(* Natural language in the Outline entries cannot be determined. *) (* Natural language in the Outline entries cannot be determined. *)
let matterhorn_11_003 pdf = todo () let matterhorn_11_003 _ _ pdf = todo ()
(* Natural language in the Contents entry for annotations cannot be determined. (* Natural language in the Contents entry for annotations cannot be determined.
*) *)
let matterhorn_11_004 pdf = todo () let matterhorn_11_004 _ _ pdf = todo ()
(* Natural language in the TU entry for form fields cannot be determined. *) (* Natural language in the TU entry for form fields cannot be determined. *)
let matterhorn_11_005 pdf = todo () let matterhorn_11_005 _ _ pdf = todo ()
(* Natural language for document metadata cannot be determined. *) (* Natural language for document metadata cannot be determined. *)
let matterhorn_11_006 pdf = todo () let matterhorn_11_006 _ _ pdf = todo ()
(* <Figure> tag alternative or replacement text missing. *) (* <Figure> tag alternative or replacement text missing. *)
let matterhorn_13_004 pdf = todo () let matterhorn_13_004 _ _ pdf = todo ()
let is_hnum s = let is_hnum s =
match explode s with match explode s with
@ -411,18 +404,17 @@ let is_hnum s =
| _ -> false | _ -> false
(* Does use numbered headings, but the first heading tag is not <H1>. *) (* Does use numbered headings, but the first heading tag is not <H1>. *)
let matterhorn_14_002 pdf = let matterhorn_14_002 st st2 pdf =
let rec check_hn = function let rec check_hn = function
| E ("/H1", cs) -> () | E ("/H1", cs) -> ()
| E (s, cs) when is_hnum s -> merror () | E (s, cs) when is_hnum s -> merror ()
| E (_, cs) -> iter check_hn cs | E (_, cs) -> iter check_hn cs
in in
let st = read_st pdf in
check_hn st check_hn st
(* Numbered heading levels in descending sequence are skipped (Example: <H3> (* Numbered heading levels in descending sequence are skipped (Example: <H3>
follows directly after <H1>). *) follows directly after <H1>). *)
let matterhorn_14_003 pdf = let matterhorn_14_003 st st2 pdf =
let rec check_nseq n = function let rec check_nseq n = function
| E (s, cs) when is_hnum s -> | E (s, cs) when is_hnum s ->
let num = int_of_string (implode (tl (tl (explode s)))) in let num = int_of_string (implode (tl (tl (explode s)))) in
@ -430,11 +422,10 @@ let matterhorn_14_003 pdf =
iter (check_nseq num) cs iter (check_nseq num) cs
| E (_, cs) -> iter (check_nseq n) cs | E (_, cs) -> iter (check_nseq n) cs
in in
check_nseq 0 (read_st pdf) check_nseq 0 st
(* A node contains more than one <H> tag. *) (* A node contains more than one <H> tag. *)
let matterhorn_14_006 pdf = let matterhorn_14_006 st st2 pdf =
let st = read_st pdf in
let found = ref false in let found = ref false in
let rec check_hs (E (_, cs)) = let rec check_hs (E (_, cs)) =
if length (option_map (function E ("/H", _) -> Some () | _ -> None) cs) > 1 then set found; if length (option_map (function E ("/H", _) -> Some () | _ -> None) cs) > 1 then set found;
@ -444,13 +435,12 @@ let matterhorn_14_006 pdf =
if !found then merror () if !found then merror ()
(* Document uses both <H> and <H#> tags. *) (* Document uses both <H> and <H#> tags. *)
let matterhorn_14_007 pdf = let matterhorn_14_007 st st2 pdf =
let st = read_st pdf in
if st_mem (eq "/H") st && st_mem is_hnum st then merror () if st_mem (eq "/H") st && st_mem is_hnum st then merror ()
(* In a table not organized with Headers attributes and IDs, a <TH> cell does (* In a table not organized with Headers attributes and IDs, a <TH> cell does
not contain a Scope attribute. *) not contain a Scope attribute. *)
let matterhorn_15_003 pdf = let matterhorn_15_003 st st2 pdf =
(* For now, we complain any time a <TH> does not have a scope. The 2008 PDF (* For now, we complain any time a <TH> does not have a scope. The 2008 PDF
spec, 2014 PDF/UA spec and Matterhorn protocol combined do not quite give spec, 2014 PDF/UA spec and Matterhorn protocol combined do not quite give
enough information to know what is required. To be returned to. *) enough information to know what is required. To be returned to. *)
@ -459,32 +449,32 @@ let matterhorn_15_003 pdf =
if not (List.mem "/Scope" attr) then merror_str "No scope, table organization not checked." (*else Printf.printf "Found /Scope in /TH\n"*) if not (List.mem "/Scope" attr) then merror_str "No scope, table organization not checked." (*else Printf.printf "Found /Scope in /TH\n"*)
| E2 (_, _, cs) -> iter check_th cs | E2 (_, _, cs) -> iter check_th cs
in in
check_th (read_st2 pdf) check_th st2
(* <Formula> tag is missing an Alt attribute. *) (* <Formula> tag is missing an Alt attribute. *)
let matterhorn_17_002 pdf = let matterhorn_17_002 _ st2 pdf =
let rec check_fm = function let rec check_fm = function
| E2 ("/Formula", attr, _) -> | E2 ("/Formula", attr, _) ->
if not (List.mem "/Alt" attr) then merror () if not (List.mem "/Alt" attr) then merror ()
| E2 (_, _, cs) -> iter check_fm cs | E2 (_, _, cs) -> iter check_fm cs
in in
check_fm (read_st2 pdf) check_fm st2
(* Unicode mapping requirements are not met. *) (* Unicode mapping requirements are not met. *)
let matterhorn_17_003 pdf = let matterhorn_17_003 _ _ pdf =
unimpl () unimpl ()
(* ID entry of the <Note> tag is not present. *) (* ID entry of the <Note> tag is not present. *)
let matterhorn_19_003 pdf = let matterhorn_19_003 st st2 pdf =
let rec check_note = function let rec check_note = function
| E2 ("/Note", attr, _) -> | E2 ("/Note", attr, _) ->
if not (List.mem "/ID" attr) then merror () if not (List.mem "/ID" attr) then merror ()
| E2 (_, _, cs) -> iter check_note cs | E2 (_, _, cs) -> iter check_note cs
in in
check_note (read_st2 pdf) check_note st2
(* ID entry of the <Note> tag is non-unique. *) (* ID entry of the <Note> tag is non-unique. *)
let matterhorn_19_004 pdf = let matterhorn_19_004 _ _ pdf =
(* Looking for /Type /StructElem /N /Note /ID to exist. *) (* Looking for /Type /StructElem /N /Note /ID to exist. *)
(* FIXME ClassMaps here? *) (* FIXME ClassMaps here? *)
let ids = ref [] in let ids = ref [] in
@ -494,13 +484,12 @@ let matterhorn_19_004 pdf =
| Some (Pdf.Name "/StructElem"), Some (Pdf.Name "/Note"), Some (Pdf.String s) -> ids := s::!ids | Some (Pdf.Name "/StructElem"), Some (Pdf.Name "/Note"), Some (Pdf.String s) -> ids := s::!ids
| _ -> ()) | _ -> ())
pdf; pdf;
(* FIXME should write the big setify here, fix up in CamlPDF - this is a good example. *) if length (setify_large !ids) < length !ids then merror ()
if length (setify !ids) < length !ids then merror ()
(* Name entry is missing or has an empty string as its value in an Optional (* Name entry is missing or has an empty string as its value in an Optional
Content Configuration Dictionary in the Configs entry in the OCProperties Content Configuration Dictionary in the Configs entry in the OCProperties
entry in the Catalog dictionary. *) entry in the Catalog dictionary. *)
let matterhorn_20_001 pdf = let matterhorn_20_001 _ _ pdf =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/OCProperties"; "/Configs"] with match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/OCProperties"; "/Configs"] with
| Some (Pdf.Array occds) -> | Some (Pdf.Array occds) ->
iter (function x -> match Pdf.lookup_direct pdf "/Name" x with None | Some (Pdf.Name "") -> merror () | _ -> ()) occds iter (function x -> match Pdf.lookup_direct pdf "/Name" x with None | Some (Pdf.Name "") -> merror () | _ -> ()) occds
@ -509,13 +498,13 @@ let matterhorn_20_001 pdf =
(* Name entry is missing or has an empty string as its value in an Optional (* Name entry is missing or has an empty string as its value in an Optional
Content Configuration Dictionary that is the value of the D entry in the Content Configuration Dictionary that is the value of the D entry in the
OCProperties entry in the Catalog dictionary. *) OCProperties entry in the Catalog dictionary. *)
let matterhorn_20_002 pdf = let matterhorn_20_002 _ _ pdf =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/OCProperties"; "/D"; "/Name"] with match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/OCProperties"; "/D"; "/Name"] with
| Some (Pdf.String "") | None -> merror () | Some (Pdf.String "") | None -> merror ()
| _ -> () | _ -> ()
(* An AS entry appears in an Optional Content Configuration Dictionary. *) (* An AS entry appears in an Optional Content Configuration Dictionary. *)
let matterhorn_20_003 pdf = let matterhorn_20_003 _ _ pdf =
begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/OCProperties"; "/D"; "/AS"] with begin match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/OCProperties"; "/D"; "/AS"] with
| Some _ -> merror () | Some _ -> merror ()
| _ -> () | _ -> ()
@ -528,7 +517,7 @@ let matterhorn_20_003 pdf =
(* The file specification dictionary for an embedded file does not contain F (* The file specification dictionary for an embedded file does not contain F
and UF entries. *) and UF entries. *)
let matterhorn_21_001 pdf = let matterhorn_21_001 _ _ pdf =
let from_nametree = let from_nametree =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog -> | Some catalog ->
@ -562,7 +551,7 @@ let matterhorn_21_001 pdf =
merror () merror ()
(* File contains the dynamicRender element with value “required”. *) (* File contains the dynamicRender element with value “required”. *)
let matterhorn_25_001 pdf = let matterhorn_25_001 _ _ pdf =
let rec contains_required_dynamicRender = function let rec contains_required_dynamicRender = function
| Cpdfmetadata.E (((_, "dynamicRender"), _), [Cpdfmetadata.D "required"]) -> true | Cpdfmetadata.E (((_, "dynamicRender"), _), [Cpdfmetadata.D "required"]) -> true
| Cpdfmetadata.E (_, children) -> List.exists contains_required_dynamicRender children | Cpdfmetadata.E (_, children) -> List.exists contains_required_dynamicRender children
@ -597,13 +586,13 @@ let matterhorn_25_001 pdf =
(* The file is encrypted but does not contain a P entry in its encryption (* The file is encrypted but does not contain a P entry in its encryption
dictionary. *) dictionary. *)
let matterhorn_26_001 pdf = () let matterhorn_26_001 _ _ pdf = ()
(* Would already have failed at this point, because CamlPDF does not allow (* Would already have failed at this point, because CamlPDF does not allow
the decryption of a file with no /P *) the decryption of a file with no /P. FIXME Can we make it fail with the right message? *)
(* The file is encrypted and does contain a P entry but the 10th bit position (* The file is encrypted and does contain a P entry but the 10th bit position
of the P entry is false. *) of the P entry is false. *)
let matterhorn_26_002 pdf = let matterhorn_26_002 _ _ pdf =
match pdf.Pdf.saved_encryption with match pdf.Pdf.saved_encryption with
| None -> () | None -> ()
| Some {Pdf.from_get_encryption_values = (_, _, _, p, _, _, _)} -> | Some {Pdf.from_get_encryption_values = (_, _, _, p, _, _, _)} ->
@ -611,20 +600,20 @@ let matterhorn_26_002 pdf =
(* An annotation, other than of subtype Widget, Link and PrinterMark, is not a (* An annotation, other than of subtype Widget, Link and PrinterMark, is not a
direct child of an <Annot> structure element. *) direct child of an <Annot> structure element. *)
let matterhorn_28_002 pdf = todo () let matterhorn_28_002 _ _ pdf = todo ()
(* An annotation, other than of subtype Widget, does not have a Contents entry (* An annotation, other than of subtype Widget, does not have a Contents entry
and does not have an alternative description (in the form of an Alt entry in and does not have an alternative description (in the form of an Alt entry in
the enclosing structure element). *) the enclosing structure element). *)
let matterhorn_28_004 pdf = todo () let matterhorn_28_004 _ _ pdf = todo ()
(* A form field does not have a TU entry and does not have an alternative (* A form field does not have a TU entry and does not have an alternative
description (in the form of an Alt entry in the enclosing structure description (in the form of an Alt entry in the enclosing structure
element). *) element). *)
let matterhorn_28_005 pdf = todo () let matterhorn_28_005 _ _ pdf = todo ()
(* An annotation with subtype undefined in ISO 32000 does not meet 7.18.1. *) (* An annotation with subtype undefined in ISO 32000 does not meet 7.18.1. *)
let matterhorn_28_006 pdf = let matterhorn_28_006 _ _ pdf =
if if
List.exists List.exists
(fun x -> match x.Pdfannot.subtype with Pdfannot.Unknown _ -> true | _ -> false) (fun x -> match x.Pdfannot.subtype with Pdfannot.Unknown _ -> true | _ -> false)
@ -633,7 +622,7 @@ let matterhorn_28_006 pdf =
merror () merror ()
(* An annotation of subtype TrapNet exists. *) (* An annotation of subtype TrapNet exists. *)
let matterhorn_28_007 pdf = let matterhorn_28_007 _ _ pdf =
if if
List.exists List.exists
(fun x -> x.Pdfannot.subtype = Pdfannot.TrapNet) (fun x -> x.Pdfannot.subtype = Pdfannot.TrapNet)
@ -642,7 +631,7 @@ let matterhorn_28_007 pdf =
merror () merror ()
(* A page containing an annotation does not contain a Tabs entry *) (* A page containing an annotation does not contain a Tabs entry *)
let matterhorn_28_008 pdf = let matterhorn_28_008 _ _ pdf =
if if
List.exists List.exists
(fun p -> (fun p ->
@ -653,7 +642,7 @@ let matterhorn_28_008 pdf =
(* A page containing an annotation has a Tabs entry with a value other than S. (* A page containing an annotation has a Tabs entry with a value other than S.
*) *)
let matterhorn_28_009 pdf = let matterhorn_28_009 _ _ pdf =
if if
List.exists List.exists
(fun p -> (fun p ->
@ -665,14 +654,14 @@ let matterhorn_28_009 pdf =
merror () merror ()
(* A widget annotation is not nested within a <Form> tag. *) (* A widget annotation is not nested within a <Form> tag. *)
let matterhorn_28_010 pdf = todo () let matterhorn_28_010 _ _ pdf = todo ()
(* A link annotation is not nested within a <Link> tag. *) (* A link annotation is not nested within a <Link> tag. *)
let matterhorn_28_011 pdf = todo () let matterhorn_28_011 _ _ pdf = todo ()
(* A link annotation does not include an alternate description in its Contents (* A link annotation does not include an alternate description in its Contents
entry. *) entry. *)
let matterhorn_28_012 pdf = let matterhorn_28_012 _ _ pdf =
if if
List.exists List.exists
(fun x -> x.Pdfannot.subtype = Pdfannot.Link && x.Pdfannot.annot_contents = None ) (fun x -> x.Pdfannot.subtype = Pdfannot.Link && x.Pdfannot.annot_contents = None )
@ -681,7 +670,7 @@ let matterhorn_28_012 pdf =
merror () merror ()
(* CT entry is missing from the media clip data dictionary. *) (* CT entry is missing from the media clip data dictionary. *)
let matterhorn_28_014 pdf = let matterhorn_28_014 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/S" o, Pdf.lookup_direct pdf "/CT" o with match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/S" o, Pdf.lookup_direct pdf "/CT" o with
@ -690,7 +679,7 @@ let matterhorn_28_014 pdf =
pdf pdf
(* Alt entry is missing from the media clip data dictionary. *) (* Alt entry is missing from the media clip data dictionary. *)
let matterhorn_28_015 pdf = let matterhorn_28_015 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/S" o, Pdf.lookup_direct pdf "/CT" o with match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/S" o, Pdf.lookup_direct pdf "/CT" o with
@ -699,18 +688,19 @@ let matterhorn_28_015 pdf =
pdf pdf
(* File attachment annotations do not conform to 7.11. *) (* File attachment annotations do not conform to 7.11. *)
let matterhorn_28_016 pdf = let matterhorn_28_016 _ _ pdf =
(* FIXME ?? *)
covered_elsewhere () covered_elsewhere ()
(* A PrinterMark annotation is included in the logical structure. *) (* A PrinterMark annotation is included in the logical structure. *)
let matterhorn_28_017 pdf = todo () let matterhorn_28_017 _ _ pdf = todo ()
(* The appearance stream of a PrinterMark annotation is not marked as Artifact. (* The appearance stream of a PrinterMark annotation is not marked as Artifact.
*) *)
let matterhorn_28_018 pdf = todo () let matterhorn_28_018 _ _ pdf = todo ()
(* A reference XObject is present. *) (* A reference XObject is present. *)
let matterhorn_30_001 pdf = let matterhorn_30_001 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o, Pdf.lookup_direct pdf "/Ref" o with match Pdf.lookup_direct pdf "/Subtype" o, Pdf.lookup_direct pdf "/Ref" o with
@ -719,7 +709,7 @@ let matterhorn_30_001 pdf =
pdf pdf
(* Form XObject contains MCIDs and is referenced more than once. *) (* Form XObject contains MCIDs and is referenced more than once. *)
let matterhorn_30_002 pdf = let matterhorn_30_002 _ _ pdf =
(* We need to consider inheritence here. What solutions do we already have (* We need to consider inheritence here. What solutions do we already have
for that, and do we need anything new? *) for that, and do we need anything new? *)
unimpl () unimpl ()
@ -727,7 +717,7 @@ let matterhorn_30_002 pdf =
(* A Type 0 font dictionary with encoding other than Identity-H and Identity-V (* A Type 0 font dictionary with encoding other than Identity-H and Identity-V
has values for Registry in both CIDSystemInfo dictionaries that are not has values for Registry in both CIDSystemInfo dictionaries that are not
identical. *) identical. *)
let matterhorn_31_001 pdf = let matterhorn_31_001 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o, Pdf.lookup_direct pdf "/Encoding" o with match Pdf.lookup_direct pdf "/Subtype" o, Pdf.lookup_direct pdf "/Encoding" o with
@ -742,19 +732,19 @@ let matterhorn_31_001 pdf =
(* A Type 0 font dictionary with encoding other than Identity-H and Identity-V (* A Type 0 font dictionary with encoding other than Identity-H and Identity-V
has values for Ordering in both CIDSystemInfo dictionaries that are not has values for Ordering in both CIDSystemInfo dictionaries that are not
identical. *) identical. *)
let matterhorn_31_002 pdf = let matterhorn_31_002 st st2 pdf =
matterhorn_31_001 pdf matterhorn_31_001 st st2 pdf
(* A Type 0 font dictionary with encoding other than Identity-H and Identity-V (* A Type 0 font dictionary with encoding other than Identity-H and Identity-V
has a value for Supplement in the CIDSystemInfo dictionary of the CID font has a value for Supplement in the CIDSystemInfo dictionary of the CID font
that is less than the value for Supplement in the CIDSystemInfo dictionary that is less than the value for Supplement in the CIDSystemInfo dictionary
of the CMap. *) of the CMap. *)
let matterhorn_31_003 pdf = let matterhorn_31_003 st st2 pdf =
matterhorn_31_001 pdf matterhorn_31_001 st st2 pdf
(* A Type 2 CID font contains neither a stream nor the name Identity as the (* A Type 2 CID font contains neither a stream nor the name Identity as the
value of the CIDToGIDMap entry. *) value of the CIDToGIDMap entry. *)
let matterhorn_31_004 pdf = let matterhorn_31_004 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ n -> (fun _ n ->
match Pdf.lookup_direct pdf "/Subtype" n with match Pdf.lookup_direct pdf "/Subtype" n with
@ -767,7 +757,7 @@ let matterhorn_31_004 pdf =
pdf pdf
(* A Type 2 CID font does not contain a CIDToGIDMap entry. *) (* A Type 2 CID font does not contain a CIDToGIDMap entry. *)
let matterhorn_31_005 pdf = let matterhorn_31_005 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ n -> (fun _ n ->
match Pdf.lookup_direct pdf "/Subtype" n with match Pdf.lookup_direct pdf "/Subtype" n with
@ -842,7 +832,7 @@ let cmap_names =
"/Identity-H"; "/Identity-H";
"/Identity-V"] "/Identity-V"]
let matterhorn_31_006 pdf = let matterhorn_31_006 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o with match Pdf.lookup_direct pdf "/Subtype" o with
@ -856,65 +846,65 @@ let matterhorn_31_006 pdf =
(* The WMode entry in a CMap dictionary is not identical to the WMode value in (* The WMode entry in a CMap dictionary is not identical to the WMode value in
the CMap stream. *) the CMap stream. *)
let matterhorn_31_007 pdf = let matterhorn_31_007 _ _ pdf =
unimpl () unimpl ()
(* A CMap references another CMap which is not listed in ISO 32000-1:2008, (* A CMap references another CMap which is not listed in ISO 32000-1:2008,
9.7.5.2, Table 118. *) 9.7.5.2, Table 118. *)
let matterhorn_31_008 pdf = let matterhorn_31_008 _ _ pdf =
unimpl () unimpl ()
(* For a font used by text intended to be rendered the font program is not (* For a font used by text intended to be rendered the font program is not
embedded. *) embedded. *)
(* NB This, for now, reports all unembedded fonts. *) (* NB This, for now, reports all unembedded fonts. *)
let matterhorn_31_009 pdf = let matterhorn_31_009 _ _ pdf =
let l = Cpdffont.missing_fonts_return pdf (ilist 1 (Pdfpage.endpage pdf)) in let l = Cpdffont.missing_fonts_return pdf (ilist 1 (Pdfpage.endpage pdf)) in
if l <> [] then if l <> [] then
raise (MatterhornError (`List (map (fun x -> `String x) l))) raise (MatterhornError (`List (map (fun x -> `String x) l)))
(* For a font used by text the font program is embedded but it does not contain (* For a font used by text the font program is embedded but it does not contain
glyphs for all of the glyphs referenced by the text used for rendering. *) glyphs for all of the glyphs referenced by the text used for rendering. *)
let matterhorn_31_011 pdf = let matterhorn_31_011 _ _ pdf =
unimpl () unimpl ()
(* The FontDescriptor dictionary of an embedded Type 1 font contains a CharSet (* The FontDescriptor dictionary of an embedded Type 1 font contains a CharSet
string, but at least one of the glyphs present in the font program is not string, but at least one of the glyphs present in the font program is not
listed in the CharSet string. *) listed in the CharSet string. *)
let matterhorn_31_012 pdf = let matterhorn_31_012 _ _ pdf =
unimpl () unimpl ()
(* The FontDescriptor dictionary of an embedded Type 1 font contains a CharSet (* The FontDescriptor dictionary of an embedded Type 1 font contains a CharSet
string, but at least one of the glyphs listed in the CharSet string is not string, but at least one of the glyphs listed in the CharSet string is not
present in the font program. *) present in the font program. *)
let matterhorn_31_013 pdf = let matterhorn_31_013 _ _ pdf =
unimpl () unimpl ()
(* The FontDescriptor dictionary of an embedded CID font contains a CIDSet (* The FontDescriptor dictionary of an embedded CID font contains a CIDSet
string, but at least one of the glyphs present in the font program is not string, but at least one of the glyphs present in the font program is not
listed in the CIDSet string. *) listed in the CIDSet string. *)
let matterhorn_31_014 pdf = let matterhorn_31_014 _ _ pdf =
unimpl () unimpl ()
(* The FontDescriptor dictionary of an embedded CID font contains a CIDSet (* The FontDescriptor dictionary of an embedded CID font contains a CIDSet
string, but at least one of the glyphs listed in the CIDSet string is not string, but at least one of the glyphs listed in the CIDSet string is not
present in the font program. *) present in the font program. *)
let matterhorn_31_015 pdf = let matterhorn_31_015 _ _ pdf =
unimpl () unimpl ()
(* For one or more glyphs, the glyph width information in the font dictionary (* For one or more glyphs, the glyph width information in the font dictionary
and in the embedded font program differ by more than 1/1000 unit. *) and in the embedded font program differ by more than 1/1000 unit. *)
let matterhorn_31_016 pdf = let matterhorn_31_016 _ _ pdf =
unimpl () unimpl ()
(* A non-symbolic TrueType font is used for rendering, but none of the cmap (* A non-symbolic TrueType font is used for rendering, but none of the cmap
entries in the embedded font program is a non-symbolic cmap. *) entries in the embedded font program is a non-symbolic cmap. *)
let matterhorn_31_017 pdf = let matterhorn_31_017 _ _ pdf =
unimpl () unimpl ()
(* A non-symbolic TrueType font is used for rendering, but for at least one (* A non-symbolic TrueType font is used for rendering, but for at least one
glyph to be rendered the glyph cannot be looked up by any of the glyph to be rendered the glyph cannot be looked up by any of the
non-symbolic cmap entries in the embedded font program. *) non-symbolic cmap entries in the embedded font program. *)
let matterhorn_31_018 pdf = let matterhorn_31_018 _ _ pdf =
unimpl () unimpl ()
(* The font dictionary for a non-symbolic TrueType font does not contain an (* The font dictionary for a non-symbolic TrueType font does not contain an
@ -928,7 +918,7 @@ let is_non_symbolic pdf o =
end end
| None -> true | None -> true
let matterhorn_31_019 pdf = let matterhorn_31_019 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o with match Pdf.lookup_direct pdf "/Subtype" o with
@ -942,7 +932,7 @@ let matterhorn_31_019 pdf =
(* The font dictionary for a non-symbolic TrueType font contains an Encoding (* The font dictionary for a non-symbolic TrueType font contains an Encoding
dictionary which does not contain a BaseEncoding entry. *) dictionary which does not contain a BaseEncoding entry. *)
let matterhorn_31_020 pdf = let matterhorn_31_020 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o with match Pdf.lookup_direct pdf "/Subtype" o with
@ -961,7 +951,7 @@ let matterhorn_31_020 pdf =
(* The value for either the Encoding entry or the BaseEncoding entry in the (* The value for either the Encoding entry or the BaseEncoding entry in the
Encoding dictionary in a non-symbolic TrueType font dictionary is neither Encoding dictionary in a non-symbolic TrueType font dictionary is neither
MacRomanEncoding nor WinAnsiEncoding. *) MacRomanEncoding nor WinAnsiEncoding. *)
let matterhorn_31_021 pdf = let matterhorn_31_021 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o with match Pdf.lookup_direct pdf "/Subtype" o with
@ -983,7 +973,7 @@ let matterhorn_31_021 pdf =
(* The Differences array in the Encoding entry in a non-symbolic TrueType font (* The Differences array in the Encoding entry in a non-symbolic TrueType font
dictionary contains one or more glyph names which are not listed in the dictionary contains one or more glyph names which are not listed in the
Adobe Glyph List. *) Adobe Glyph List. *)
let matterhorn_31_022 pdf = let matterhorn_31_022 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o with match Pdf.lookup_direct pdf "/Subtype" o with
@ -1005,12 +995,12 @@ let matterhorn_31_022 pdf =
(* The Differences array is present in the Encoding entry in a non-symbolic (* The Differences array is present in the Encoding entry in a non-symbolic
TrueType font dictionary but the embedded font program does not contain a TrueType font dictionary but the embedded font program does not contain a
(3,1) Microsoft Unicode cmap. *) (3,1) Microsoft Unicode cmap. *)
let matterhorn_31_023 pdf = let matterhorn_31_023 _ _ pdf =
unimpl () unimpl ()
(* The Encoding entry is present in the font dictionary for a symbolic TrueType (* The Encoding entry is present in the font dictionary for a symbolic TrueType
font. *) font. *)
let matterhorn_31_024 pdf = let matterhorn_31_024 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
match Pdf.lookup_direct pdf "/Subtype" o with match Pdf.lookup_direct pdf "/Subtype" o with
@ -1023,12 +1013,12 @@ let matterhorn_31_024 pdf =
pdf pdf
(* The embedded font program for a symbolic TrueType font contains no cmap. *) (* The embedded font program for a symbolic TrueType font contains no cmap. *)
let matterhorn_31_025 pdf = let matterhorn_31_025 _ _ pdf =
unimpl () unimpl ()
(* The embedded font program for a symbolic TrueType font contains more than (* The embedded font program for a symbolic TrueType font contains more than
one cmap, but none of the cmap entries is a (3,0) Microsoft Symbol cmap. *) one cmap, but none of the cmap entries is a (3,0) Microsoft Symbol cmap. *)
let matterhorn_31_026 pdf = let matterhorn_31_026 _ _ pdf =
unimpl () unimpl ()
(* A font dictionary does not contain the ToUnicode entry and none of the (* A font dictionary does not contain the ToUnicode entry and none of the
@ -1039,7 +1029,7 @@ let matterhorn_31_026 pdf =
Annex D; the font is a Type 0 font, and its descendant CIDFont uses Annex D; the font is a Type 0 font, and its descendant CIDFont uses
Adobe-GB1, Adobe-CNS1, Adobe-Japan1 or Adobe-Korea1 character collections; Adobe-GB1, Adobe-CNS1, Adobe-Japan1 or Adobe-Korea1 character collections;
the font is a non-symbolic TrueType font. *) the font is a non-symbolic TrueType font. *)
let matterhorn_31_027 pdf = let matterhorn_31_027 _ _ pdf =
not_fully_implemented (); not_fully_implemented ();
(* Here, we implement most of this one, but can't check the set of referenced (* Here, we implement most of this one, but can't check the set of referenced
glyphs for Type1 / Type3. *) glyphs for Type1 / Type3. *)
@ -1099,7 +1089,7 @@ let check_unicode tu n =
mem n (flatten (map (fun x -> Pdftext.codepoints_of_utf16be (snd x)) tu)) mem n (flatten (map (fun x -> Pdftext.codepoints_of_utf16be (snd x)) tu))
(* One or more Unicode values specified in the ToUnicode CMap are zero (0). *) (* One or more Unicode values specified in the ToUnicode CMap are zero (0). *)
let matterhorn_31_028 pdf = let matterhorn_31_028 _ _ pdf =
iter iter
(fun i -> (fun i ->
let tu = Pdftext.parse_tounicode pdf (Pdf.lookup_obj pdf i) in let tu = Pdftext.parse_tounicode pdf (Pdf.lookup_obj pdf i) in
@ -1108,7 +1098,7 @@ let matterhorn_31_028 pdf =
(* One or more Unicode values specified in the ToUnicode CMap are equal to (* One or more Unicode values specified in the ToUnicode CMap are equal to
either U+FEFF or U+FFFE. *) either U+FEFF or U+FFFE. *)
let matterhorn_31_029 pdf = let matterhorn_31_029 _ _ pdf =
iter iter
(fun i -> (fun i ->
let tu = Pdftext.parse_tounicode pdf (Pdf.lookup_obj pdf i) in let tu = Pdftext.parse_tounicode pdf (Pdf.lookup_obj pdf i) in
@ -1117,7 +1107,7 @@ let matterhorn_31_029 pdf =
(* One or more characters used in text showing operators reference the .notdef (* One or more characters used in text showing operators reference the .notdef
glyph. *) glyph. *)
let matterhorn_31_030 pdf = let matterhorn_31_030 _ _ pdf =
unimpl () unimpl ()
let matterhorn = let matterhorn =
@ -1211,9 +1201,11 @@ let matterhorn =
] ]
let test_matterhorn pdf = let test_matterhorn pdf =
let st2 = read_st2 pdf in
let st = st_of_st2 st2 in
option_map option_map
(fun (name, error, section, test) -> (fun (name, error, section, test) ->
try test pdf; None with try test st st2 pdf; None with
| MatterhornError extra -> Some (name, error, section, extra) | MatterhornError extra -> Some (name, error, section, extra)
| MatterhornUnimplemented -> None | MatterhornUnimplemented -> None
| e -> Some (name, "Incomplete", section, `String ("ERROR: " ^ Printexc.to_string e))) | e -> Some (name, "Incomplete", section, `String ("ERROR: " ^ Printexc.to_string e)))