2021-12-22 10:31:39 +01:00
|
|
|
open Pdfutil
|
|
|
|
open Pdfio
|
|
|
|
open Cpdferror
|
|
|
|
|
2021-12-31 20:33:34 +01:00
|
|
|
(* Blacken text *)
|
2021-12-22 10:31:39 +01:00
|
|
|
|
|
|
|
(*
|
|
|
|
Algorithm: Change
|
|
|
|
BT
|
|
|
|
<ops>
|
|
|
|
ET
|
|
|
|
|
|
|
|
...to...
|
|
|
|
|
|
|
|
BT
|
|
|
|
Op_g 0.
|
|
|
|
<ops minus any color, shading or gs operators>
|
|
|
|
ET
|
|
|
|
<ops minus any text positioning or text rendering ones>
|
|
|
|
*)
|
|
|
|
let blacktext_ops colour pdf resources content =
|
|
|
|
let not_text = function
|
|
|
|
| Pdfops.Op_Tj _ | Pdfops.Op_TJ _
|
|
|
|
| Pdfops.Op_' _ | Pdfops.Op_'' (_, _, _)
|
|
|
|
| Pdfops.Op_Td (_, _) | Pdfops.Op_TD (_, _)
|
|
|
|
| Pdfops.Op_Tm _ | Pdfops.Op_T'
|
|
|
|
| Pdfops.Op_Tc _
|
|
|
|
| Pdfops.Op_Tw _
|
|
|
|
| Pdfops.Op_Tz _
|
|
|
|
| Pdfops.Op_TL _
|
|
|
|
| Pdfops.Op_Tf (_, _)
|
|
|
|
| Pdfops.Op_Tr _
|
|
|
|
| Pdfops.Op_Ts _ -> false
|
|
|
|
| _ -> true
|
|
|
|
in let textlevel = ref 0
|
|
|
|
in let removed = ref []
|
|
|
|
in let operators =
|
|
|
|
Pdfops.parse_operators pdf resources content
|
|
|
|
in
|
|
|
|
let rec remove_colourops prev = function
|
|
|
|
| [] -> rev prev
|
|
|
|
| Pdfops.Op_BT::more ->
|
|
|
|
incr textlevel;
|
|
|
|
remove_colourops
|
|
|
|
(Cpdfaddtext.colour_op colour::Pdfops.Op_BT::prev)
|
|
|
|
more
|
|
|
|
| Pdfops.Op_ET::more ->
|
|
|
|
decr textlevel;
|
|
|
|
let prev' = !removed @ Pdfops.Op_ET::prev in
|
|
|
|
removed := [];
|
|
|
|
remove_colourops prev' more
|
|
|
|
| (Pdfops.Op_G _
|
|
|
|
| Pdfops.Op_g _
|
|
|
|
| Pdfops.Op_RG (_, _, _)
|
|
|
|
| Pdfops.Op_rg (_, _, _)
|
|
|
|
| Pdfops.Op_k (_, _, _, _)
|
|
|
|
| Pdfops.Op_K (_, _, _, _)
|
|
|
|
| Pdfops.Op_SCN _
|
|
|
|
| Pdfops.Op_SC _
|
|
|
|
| Pdfops.Op_scn _
|
|
|
|
| Pdfops.Op_sc _
|
|
|
|
| Pdfops.Op_SCNName (_, _)
|
|
|
|
| Pdfops.Op_scnName (_, _)
|
|
|
|
| Pdfops.Op_CS _
|
|
|
|
| Pdfops.Op_cs _
|
|
|
|
| Pdfops.Op_sh _
|
|
|
|
| Pdfops.Op_gs _)
|
|
|
|
as op::more ->
|
|
|
|
if !textlevel > 0
|
|
|
|
then
|
|
|
|
begin
|
|
|
|
removed =| op;
|
|
|
|
remove_colourops prev more
|
|
|
|
end
|
|
|
|
else remove_colourops (op::prev) more
|
|
|
|
| op::more ->
|
|
|
|
if !textlevel > 0 && not_text op then removed =| op;
|
|
|
|
remove_colourops (op::prev) more
|
|
|
|
in
|
|
|
|
let operators' = remove_colourops [] operators in
|
|
|
|
[Pdfops.stream_of_ops operators']
|
|
|
|
|
|
|
|
(* Blacken a form xobject, writing it to the same object. *)
|
|
|
|
|
|
|
|
let blacktext c range pdf =
|
|
|
|
let blacktext_page _ page =
|
|
|
|
let content' =
|
|
|
|
blacktext_ops c pdf page.Pdfpage.resources page.Pdfpage.content
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Pdfpage.process_xobjects pdf page (blacktext_ops c);
|
2021-12-22 10:31:39 +01:00
|
|
|
{page with Pdfpage.content = content'}
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub blacktext_page) pdf range
|
2021-12-22 10:31:39 +01:00
|
|
|
|
2021-12-31 20:33:34 +01:00
|
|
|
(* Blacken lines *)
|
2021-12-22 10:31:39 +01:00
|
|
|
let blacklines_ops c pdf resources content =
|
|
|
|
let rec blacken_strokeops prev = function
|
|
|
|
| [] -> rev prev
|
|
|
|
| Pdfops.Op_CS _::t ->
|
|
|
|
blacken_strokeops (Pdfops.Op_CS "/DeviceRGB"::prev) t
|
|
|
|
| (Pdfops.Op_SC _ | Pdfops.Op_SCN _ | Pdfops.Op_SCNName _ | Pdfops.Op_G _
|
|
|
|
| Pdfops.Op_RG _ | Pdfops.Op_K _)::t ->
|
|
|
|
blacken_strokeops (Cpdfaddtext.colour_op_stroke c::prev) t
|
|
|
|
| h::t -> blacken_strokeops (h::prev) t
|
|
|
|
and operators =
|
|
|
|
Pdfops.parse_operators pdf resources content
|
|
|
|
in
|
|
|
|
let operators' = blacken_strokeops [] operators in
|
|
|
|
[Pdfops.stream_of_ops operators']
|
|
|
|
|
|
|
|
let blacklines c range pdf =
|
|
|
|
let blacklines_page _ page =
|
|
|
|
let content' =
|
|
|
|
blacklines_ops c pdf page.Pdfpage.resources page.Pdfpage.content
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Pdfpage.process_xobjects pdf page (blacklines_ops c);
|
2021-12-22 10:31:39 +01:00
|
|
|
{page with Pdfpage.content = content'}
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub blacklines_page) pdf range
|
2021-12-22 10:31:39 +01:00
|
|
|
|
2021-12-31 20:33:34 +01:00
|
|
|
(* Blacken Fills *)
|
2021-12-22 10:31:39 +01:00
|
|
|
let blackfills_ops c pdf resources content =
|
|
|
|
let rec blacken_fillops prev = function
|
|
|
|
| [] -> rev prev
|
|
|
|
| Pdfops.Op_cs _::t ->
|
|
|
|
blacken_fillops (Pdfops.Op_cs "/DeviceRGB"::prev) t
|
|
|
|
| (Pdfops.Op_sc _ | Pdfops.Op_scn _ | Pdfops.Op_scnName _ | Pdfops.Op_g _
|
|
|
|
| Pdfops.Op_rg _ | Pdfops.Op_k _)::t ->
|
|
|
|
blacken_fillops (Cpdfaddtext.colour_op c::prev) t
|
|
|
|
| h::t -> blacken_fillops (h::prev) t
|
|
|
|
and operators =
|
|
|
|
Pdfops.parse_operators pdf resources content
|
|
|
|
in
|
|
|
|
let operators' = blacken_fillops [] operators in
|
|
|
|
[Pdfops.stream_of_ops operators']
|
|
|
|
|
|
|
|
let blackfills c range pdf =
|
|
|
|
let blackfills_page _ page =
|
|
|
|
let content' =
|
|
|
|
blackfills_ops c pdf page.Pdfpage.resources page.Pdfpage.content
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Pdfpage.process_xobjects pdf page (blackfills_ops c);
|
2021-12-22 10:31:39 +01:00
|
|
|
{page with Pdfpage.content = content'}
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub blackfills_page) pdf range
|
2021-12-22 10:31:39 +01:00
|
|
|
|
2021-12-31 20:33:34 +01:00
|
|
|
(* Set a minimum line width to avoid dropout *)
|
2021-12-22 10:31:39 +01:00
|
|
|
let thinlines range width pdf =
|
|
|
|
let thinpage _ page =
|
|
|
|
let operators =
|
|
|
|
Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
|
|
|
|
in
|
|
|
|
let ctmstack = ref [ref Pdftransform.i_matrix] in
|
|
|
|
let scaleof_ctm () =
|
|
|
|
try
|
|
|
|
match Pdftransform.decompose (!(hd !ctmstack)) with
|
|
|
|
(scale, _, _, _, _, _) ->
|
|
|
|
scale
|
|
|
|
with
|
|
|
|
Failure _ (*"hd"*) -> 1.
|
|
|
|
in
|
|
|
|
let rec replace_operators prev = function
|
|
|
|
| [] -> rev prev
|
|
|
|
| (Pdfops.Op_w w)::more ->
|
|
|
|
(* Alter width. *)
|
|
|
|
let width' = width /. scaleof_ctm () in
|
|
|
|
let w' =
|
|
|
|
if w >= width' then Pdfops.Op_w w else Pdfops.Op_w width'
|
|
|
|
in
|
|
|
|
replace_operators (w'::prev) more
|
|
|
|
| (Pdfops.Op_cm m)::more ->
|
|
|
|
(* Update CTM *)
|
|
|
|
begin try
|
|
|
|
let top = hd !ctmstack in
|
|
|
|
top := Pdftransform.matrix_compose !top m
|
|
|
|
with
|
|
|
|
Failure _ (*"hd"*) -> error "Malformed file."
|
|
|
|
end;
|
|
|
|
replace_operators ((Pdfops.Op_cm m)::prev) more
|
|
|
|
| Pdfops.Op_q::more ->
|
|
|
|
(* Push stack *)
|
|
|
|
begin try
|
|
|
|
ctmstack =| ref (!(hd !ctmstack))
|
|
|
|
with
|
|
|
|
Failure _ (*"hd"*) -> error "Malformed file"
|
|
|
|
end;
|
|
|
|
replace_operators (Pdfops.Op_q::prev) more
|
|
|
|
| Pdfops.Op_Q::more ->
|
|
|
|
(* Pop stack *)
|
|
|
|
begin try
|
|
|
|
ctmstack := tl !ctmstack
|
|
|
|
with
|
|
|
|
Failure _ (*"tl"*) -> error "Malformed file"
|
|
|
|
end;
|
|
|
|
replace_operators (Pdfops.Op_Q::prev) more
|
|
|
|
| (Pdfops.Op_gs gsname)::more ->
|
|
|
|
(* Perhaps insert [Op_w]. *)
|
|
|
|
let opw =
|
|
|
|
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
|
|
|
|
| None -> []
|
|
|
|
| Some ext_state_dict ->
|
|
|
|
match Pdf.lookup_direct pdf gsname ext_state_dict with
|
|
|
|
| None -> []
|
|
|
|
| Some gdict ->
|
|
|
|
match Pdf.lookup_direct pdf "/LW" gdict with
|
2022-07-14 15:06:25 +02:00
|
|
|
| Some s -> (try [Pdfops.Op_w (Pdf.getnum pdf s)] with _ -> [])
|
2021-12-22 10:31:39 +01:00
|
|
|
| None -> []
|
|
|
|
in
|
|
|
|
replace_operators (opw @ ((Pdfops.Op_gs gsname)::prev)) more
|
|
|
|
| x::more -> replace_operators (x::prev) more
|
|
|
|
in
|
|
|
|
let operators = replace_operators [] operators in
|
|
|
|
(* 2. Add an initial 'w' if width more than default width *)
|
|
|
|
let operators =
|
|
|
|
if width > 1. then (Pdfops.Op_w width)::operators else operators
|
|
|
|
in
|
|
|
|
let content' = [Pdfops.stream_of_ops operators] in
|
|
|
|
{page with Pdfpage.content = content'}
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub thinpage) pdf range
|
2021-12-22 10:31:39 +01:00
|
|
|
|
|
|
|
(* Parse the new content to make sure syntactically ok, append
|
|
|
|
* as required. Rewrite the content *)
|
|
|
|
let append_page_content_page fast s before pdf n page =
|
|
|
|
let ops =
|
|
|
|
Pdfops.parse_stream pdf page.Pdfpage.resources [bytes_of_string s]
|
|
|
|
in
|
|
|
|
(if before then Pdfpage.prepend_operators else Pdfpage.postpend_operators)
|
|
|
|
pdf ops ~fast page
|
|
|
|
|
|
|
|
let append_page_content s before fast range pdf =
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub (append_page_content_page fast s before pdf)) pdf range
|
2021-12-22 10:31:39 +01:00
|
|
|
|
|
|
|
let rec dict_entry_single_object f pdf = function
|
|
|
|
| (Pdf.Dictionary d) -> f (Pdf.recurse_dict (dict_entry_single_object f pdf) d)
|
|
|
|
| (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) ->
|
|
|
|
f (Pdf.Stream {contents = (Pdf.recurse_dict (dict_entry_single_object f pdf) dict, data)})
|
|
|
|
| Pdf.Array a -> Pdf.recurse_array (dict_entry_single_object f pdf) a
|
|
|
|
| x -> x
|
|
|
|
|
|
|
|
(* FIXME are we sure that functional values can never appear in the equality here? *)
|
|
|
|
let remove_dict_entry pdf key search =
|
|
|
|
let f d =
|
|
|
|
match search with
|
|
|
|
| None -> Pdf.remove_dict_entry d key
|
|
|
|
| Some s ->
|
|
|
|
match Pdf.lookup_direct pdf key d with
|
|
|
|
| Some v when v = s -> Pdf.remove_dict_entry d key
|
|
|
|
| _ -> d
|
|
|
|
in
|
|
|
|
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
|
|
|
|
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
|
|
|
|
|
|
|
|
let replace_dict_entry pdf key value search =
|
|
|
|
let f d =
|
|
|
|
match search with
|
2022-01-19 12:41:45 +01:00
|
|
|
| None -> begin try Pdf.replace_dict_entry d key value with _ -> d end
|
2021-12-22 10:31:39 +01:00
|
|
|
| Some s ->
|
|
|
|
match Pdf.lookup_direct pdf key d with
|
|
|
|
| Some v when v = s -> Pdf.replace_dict_entry d key value
|
|
|
|
| _ -> d
|
|
|
|
in
|
|
|
|
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
|
|
|
|
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
|
|
|
|
|
|
|
|
(* FIXME no need to self map here, since nothing changes *)
|
2023-01-17 06:37:54 +01:00
|
|
|
let print_dict_entry ~utf8 pdf key =
|
2021-12-22 10:31:39 +01:00
|
|
|
let f d =
|
|
|
|
match Pdf.lookup_direct pdf key d with
|
|
|
|
| Some v ->
|
|
|
|
(* We use a double newline as a separator. *)
|
2023-01-17 06:37:54 +01:00
|
|
|
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object ~utf8 ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v));
|
2021-12-22 10:31:39 +01:00
|
|
|
d
|
|
|
|
| None -> d
|
|
|
|
in
|
|
|
|
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
|
|
|
|
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
|
|
|
|
|
2022-01-18 17:49:06 +01:00
|
|
|
(* For cpdflib. *)
|
2023-01-17 06:37:54 +01:00
|
|
|
let get_dict_entries ~utf8 pdf key =
|
2022-01-18 17:49:06 +01:00
|
|
|
let es = ref [] in
|
|
|
|
let f d =
|
|
|
|
match Pdf.lookup_direct pdf key d with
|
2023-01-17 06:37:54 +01:00
|
|
|
| Some v -> es := Cpdfjson.json_of_object ~utf8 ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false v::!es; d
|
2022-01-18 17:49:06 +01:00
|
|
|
| None -> d
|
|
|
|
in
|
|
|
|
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
|
|
|
|
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict;
|
|
|
|
let arr = `List (rev !es) in
|
|
|
|
(Pdfio.bytes_of_string (Cpdfyojson.Safe.to_string arr))
|
|
|
|
|
2021-12-22 10:31:39 +01:00
|
|
|
let remove_clipping_ops pdf resources content =
|
|
|
|
let ops = Pdfops.parse_operators pdf resources content in
|
|
|
|
let rec process a = function
|
|
|
|
Pdfops.Op_W::Pdfops.Op_n::t -> process (Pdfops.Op_n::a) t
|
|
|
|
| h::t -> process (h::a) t
|
|
|
|
| [] -> rev a
|
|
|
|
in
|
|
|
|
[Pdfops.stream_of_ops (process [] ops)]
|
|
|
|
|
|
|
|
let remove_clipping pdf range =
|
|
|
|
let remove_clipping_page _ page =
|
|
|
|
let content' =
|
|
|
|
remove_clipping_ops pdf page.Pdfpage.resources page.Pdfpage.content
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Pdfpage.process_xobjects pdf page remove_clipping_ops;
|
2021-12-22 10:31:39 +01:00
|
|
|
{page with Pdfpage.content = content'}
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub remove_clipping_page) pdf range
|