From facb9d3c754984a2e8720a21ce13e6ed9a858db4 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 2 Jul 2024 16:16:11 +0100 Subject: [PATCH] Fixed headings checks --- cpdfua.ml | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/cpdfua.ml b/cpdfua.ml index 49792b9..ec66efa 100644 --- a/cpdfua.ml +++ b/cpdfua.ml @@ -5,7 +5,7 @@ open Cpdferror a) Those which require looking deep inside font files; and b) Those which require reading inside the graphics stream. -Experimental. Both false positive and false negativce results possible. *) +Experimental. Both false positive and false negative results possible. *) exception MatterhornError of Cpdfyojson.Safe.t @@ -422,27 +422,28 @@ let is_hnum s = begin try ignore (int_of_string (implode cs)); true with Failure _ -> false end | _ -> false +let num s = int_of_string (implode (tl (tl (explode s)))) + +let rec headings_list_of_tree (E (n, cs)) = + (if is_hnum n then [n] else []) @ flatten (map headings_list_of_tree cs) + (* Does use numbered headings, but the first heading tag is not

<H1>. *) -let matterhorn_14_002 st st2 pdf = - let rec check_hn = function - | E ("/H1", cs) -> () - | E (s, cs) when is_hnum s -> merror () - | E (_, cs) -> iter check_hn cs - in - check_hn st +let matterhorn_14_002 st _ _ = + match headings_list_of_tree st with + | [] | "/H1"::_ -> () + | _ -> merror () (* Numbered heading levels in descending sequence are skipped (Example:

<H3> follows directly after <H1>

). *) -let matterhorn_14_003 st st2 pdf = - let rec check_nseq n = function - | E (s, cs) when is_hnum s -> - let num = int_of_string (implode (tl (tl (explode s)))) in - if num > n + 1 && n > 0 then merror_str (Printf.sprintf "%i -> %i" n num); - iter (check_nseq num) cs - | E (_, cs) -> iter (check_nseq n) cs +let matterhorn_14_003 st _ _ = + let rec check l = function + | [] -> () + | n::ns -> + let nm = num n in + if nm > l + 1 then merror_str (Printf.sprintf "%i -> %i" l nm) else check nm ns in - check_nseq 0 st - + check 1 (headings_list_of_tree st) + (* A node contains more than one <H> tag. *) let matterhorn_14_006 st st2 pdf = let found = ref false in