mirror of
				https://github.com/johnwhitington/cpdf-source.git
				synced 2025-06-05 22:09:39 +02:00 
			
		
		
		
	Avoid squeezing shared content streams
This commit is contained in:
		
							
								
								
									
										44
									
								
								cpdf.ml
									
									
									
									
									
								
							
							
						
						
									
										44
									
								
								cpdf.ml
									
									
									
									
									
								
							| @@ -109,9 +109,13 @@ let really_squeeze pdf = | |||||||
|           pdf.Pdf.objects <- !pdfr.Pdf.objects; |           pdf.Pdf.objects <- !pdfr.Pdf.objects; | ||||||
|           pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict |           pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict | ||||||
|  |  | ||||||
| (* Squeeze the form xobject at objnum. FIXME: For old PDFs (< v1.2) any | (* Squeeze the form xobject at objnum. | ||||||
| resources from the page (or its ancestors in the page tree!) are also needed - |  | ||||||
| we must merge them with the ones from the xobject itself. *) | FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in | ||||||
|  | the page tree!) are also needed - we must merge them with the ones from the | ||||||
|  | xobject itself. However, it is safe for now -- in the unlikely event that the | ||||||
|  | resources actually need to be available, the parse will fail, the squeeze of | ||||||
|  | this object will fail, and we bail out. *) | ||||||
| let xobjects_done = ref [] | let xobjects_done = ref [] | ||||||
|  |  | ||||||
| let squeeze_form_xobject pdf objnum = | let squeeze_form_xobject pdf objnum = | ||||||
| @@ -142,9 +146,36 @@ let squeeze_form_xobject pdf objnum = | |||||||
|             end |             end | ||||||
|       | _ -> () |       | _ -> () | ||||||
|  |  | ||||||
|  | (* For a list of indirects representing content streams, make sure that none of | ||||||
|  | them are duplicated in the PDF. This indicates sharing, which parsing and | ||||||
|  | rewriting the streams might destroy, thus making the file bigger. FIXME: The | ||||||
|  | correct thing to do is to preserve the multiple content streams. *) | ||||||
|  | let no_duplicates content_stream_numbers stream_numbers = | ||||||
|  |   not | ||||||
|  |     (mem false | ||||||
|  |        (map | ||||||
|  |          (fun n -> length (keep (eq n) content_stream_numbers) < 2) | ||||||
|  |          stream_numbers)) | ||||||
|  |  | ||||||
|  | (* Return the list of content stream numbers for a given page reference number *) | ||||||
|  | let content_streams_of_page pdf refnum = | ||||||
|  |   match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with | ||||||
|  |     Pdf.Dictionary dict -> | ||||||
|  |       begin match lookup "/Contents" dict with | ||||||
|  |         Some (Pdf.Indirect i) -> [i] | ||||||
|  |       | Some (Pdf.Array x) -> | ||||||
|  |           option_map (function Pdf.Indirect i -> Some i | _ -> None) x | ||||||
|  |       | _ -> [] | ||||||
|  |       end | ||||||
|  |   | _ -> [] | ||||||
|  |  | ||||||
| (* For each object in the PDF marked with /Type /Page, for each /Contents | (* For each object in the PDF marked with /Type /Page, for each /Contents | ||||||
| indirect reference or array of such, decode and recode that content stream. *) | indirect reference or array of such, decode and recode that content stream. *) | ||||||
| let squeeze_all_content_streams pdf = | let squeeze_all_content_streams pdf = | ||||||
|  |   let page_reference_numbers = Pdf.page_reference_numbers pdf in | ||||||
|  |     let all_content_streams_in_doc = | ||||||
|  |       flatten (map (content_streams_of_page pdf) page_reference_numbers) | ||||||
|  |     in | ||||||
|       xobjects_done := []; |       xobjects_done := []; | ||||||
|       Pdf.objiter |       Pdf.objiter | ||||||
|         (fun objnum _ -> |         (fun objnum _ -> | ||||||
| @@ -159,7 +190,6 @@ let squeeze_all_content_streams pdf = | |||||||
|                   | None -> Pdf.Dictionary [] |                   | None -> Pdf.Dictionary [] | ||||||
|                 in |                 in | ||||||
|                   begin try |                   begin try | ||||||
|                 let newstream = |  | ||||||
|                     let content_streams = |                     let content_streams = | ||||||
|                       match lookup "/Contents" dict with |                       match lookup "/Contents" dict with | ||||||
|                         Some (Pdf.Indirect i) -> |                         Some (Pdf.Indirect i) -> | ||||||
| @@ -170,6 +200,12 @@ let squeeze_all_content_streams pdf = | |||||||
|                       | Some (Pdf.Array x) -> x |                       | Some (Pdf.Array x) -> x | ||||||
|                       | _ -> raise Not_found |                       | _ -> raise Not_found | ||||||
|                     in |                     in | ||||||
|  |                       if | ||||||
|  |                         no_duplicates | ||||||
|  |                           all_content_streams_in_doc | ||||||
|  |                           (map (function Pdf.Indirect i -> i | _ -> assert false) content_streams) | ||||||
|  |                       then | ||||||
|  |                         let newstream = | ||||||
|                           Pdfops.stream_of_ops |                           Pdfops.stream_of_ops | ||||||
|                             (Pdfops.parse_operators pdf resources content_streams) |                             (Pdfops.parse_operators pdf resources content_streams) | ||||||
|                         in |                         in | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user