d16de16 · Oversight

Improve Rust PDF text extraction

Co-authored-by: Codex (GPT-5.4) <noreply@openai.com>

d16de16 Zion Boggan committed on May 18, 2026 (1 month ago)

CHANGELOG.md +5 -0

		@@ -39,6 +39,11 @@
		- Source comment guard. Added `scripts/check_source_comments.py`, a pytest
		wrapper, and a `source-style` GitHub Actions workflow so strict comment-light
		paths fail CI if prose comments are reintroduced.
	+	- Rust PDF extraction parity. `oversight-formats` now uses lopdf page text
	+	extraction plus parsed content-stream operations for PDF fingerprint text
	+	instead of raw literal scanning. The fallback handles `Tj`, `TJ`, quote
	+	operators, and array spacing, with new Rust tests covering page-level PDF
	+	text extraction.

		## v0.4.11 - 2026-05-08 Hardware-keys completion: Python parity, browser support, end-to-end seal

README.md +4 -3

		@@ -326,7 +326,8 @@ These items are included in v0.4.4/v0.4.5 and current `main`:
		- Rust registry and format-adapter paths now mirror the Python hardening:
		authenticated DNS beacon callbacks, no silent signed-artifact drops,
		digest-checked Rekor offline verification, fail-closed Rust `max_opens`,
	-	DOCX keyword insertion, and PDF action screening.
	+	DOCX keyword insertion, PDF action screening, and parsed PDF text
	+	extraction for fingerprinting.
		- L3 semantic watermarking is opt-in for sensitive classes, requires
		disclosure acknowledgement when enabled, and records `canonical_content_hash`.
		- `.sealed` parsing rejects suite-byte tamper, malformed manifest or wrapped-DEK
		@@ -420,7 +421,7 @@ project does not backport fixes below the current stable line.
		\| Python pytest suite \| 10 \| green \|
		\| Rust oversight-container \| 17 \| green \|
		\| Rust oversight-crypto \| 21 \| green \|
	-	\| Rust oversight-formats \| 35 \| green \|
	+	\| Rust oversight-formats \| 37 \| green \|
		\| Rust oversight-manifest \| 3 \| green \|
		\| Rust oversight-policy \| 7 \| green \|
		\| Rust oversight-registry \| 8 \| green \|
		@@ -429,7 +430,7 @@ project does not backport fixes below the current stable line.
		\| Rust oversight-tlog \| 7 \| green \|
		\| Rust oversight-watermark \| 4 \| green \|
		\| Cross-language conformance \| 3 \| green \|
	-	\| Total automated Rust unit tests \| 120 \| all green \|
	+	\| Total automated Rust unit tests \| 122 \| all green \|

		## Design principles (what Oversight never does)

docs/ROADMAP.md +4 -1

		@@ -72,7 +72,10 @@ The seal, open, manifest, policy, semantic, tlog, and rekor path is
		implemented as a Rust workspace under `oversight-rust/`. `cargo build
		--workspace --release` passes with zero warnings. Python is the reference
		implementation; Rust is canonical for production deployments. A conformance
	-	suite proves bit-identical output for every manifest and envelope.
	+	suite proves bit-identical output for every manifest and envelope. Format
	+	adapter parity is being closed in bounded slices; current `main` has parsed
	+	PDF page/content-stream text extraction for fingerprinting instead of raw
	+	literal scanning.

		### Fail-closed security hardening - v0.4.4

oversight-rust/oversight-formats/src/pdf.rs +192 -32

		@@ -23,7 +23,8 @@
		//! might inject unwanted objects).

		use crate::{FormatAdapter, FormatError, WatermarkCandidate};
	-	use lopdf::{Dictionary, Document, Object, StringFormat};
	+	use lopdf::content::{Content, Operation};
	+	use lopdf::{decode_text_string, Dictionary, Document, Object, StringFormat};

		/// PDF `/Info` dictionary key for the oversight mark_id.
		const METADATA_KEY: &str = "OversightMark";
		@@ -172,34 +173,28 @@ pub fn extract_pdf_metadata(pdf_bytes: &[u8]) -> Result<PdfOversightMeta, Format
		Ok(meta)
		}

	-	/// Extract all text content from the PDF for fingerprinting and downstream
	-	/// L1/L2/L3 watermark recovery.
	-	///
	-	/// TODO: Implement full text extraction using lopdf's content stream parsing.
	-	/// For now, this extracts raw string objects from the PDF which captures
	-	/// most text but may miss some layout-dependent content.
		pub fn extract_text_for_fingerprint(pdf_bytes: &[u8]) -> Result<String, FormatError> {
		let doc = Document::load_mem(pdf_bytes)
		.map_err(\|e\| FormatError::Malformed(format!("PDF parse error: {}", e)))?;

	-	let mut text_parts: Vec<String> = Vec::new();
	+	let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
	+	if !page_numbers.is_empty() {
	+	if let Ok(text) = doc.extract_text(&page_numbers) {
	+	let normalized = normalize_pdf_text_parts(text.lines().map(str::to_owned));
	+	if !normalized.is_empty() {
	+	return Ok(normalized);
	+	}
	+	}
	+	}

	-	// Iterate all pages and extract text from content streams
	+	let mut text_parts: Vec<String> = Vec::new();
		for page_id in doc.page_iter() {
		if let Ok(content) = doc.get_page_content(page_id) {
	-	// The content stream is raw bytes; extract text between Tj/TJ operators
	-	// This is a simplified extraction -- full implementation would parse
	-	// the content stream operators properly.
	-	if let Ok(text) = String::from_utf8(content.clone()) {
	-	// Extract strings from Tj and TJ operators (simplified)
	-	for part in extract_text_from_content_stream(&text) {
	-	text_parts.push(part);
	-	}
	-	}
	+	text_parts.extend(extract_text_from_content_stream(&content));
		}
		}

	-	Ok(text_parts.join("\n"))
	+	Ok(normalize_pdf_text_parts(text_parts))
		}

		// ---------------------------------------------------------------------------
		@@ -298,20 +293,88 @@ fn get_string_from_dict(dict: &Dictionary, key: &str) -> Option<String> {
		})
		}

	-	/// Simplified text extraction from a PDF content stream.
	-	///
	-	/// Looks for `(text) Tj` and `[(text)] TJ` patterns. This is a best-effort
	-	/// extraction; a complete implementation would use a proper PDF content
	-	/// stream parser.
	-	///
	-	/// TODO: Replace with a proper content stream parser for production use.
	-	fn extract_text_from_content_stream(content: &str) -> Vec<String> {
	+	fn extract_text_from_content_stream(content: &[u8]) -> Vec<String> {
	+	if let Ok(parsed) = Content::decode(content) {
	+	return extract_text_from_operations(&parsed.operations);
	+	}
	+	extract_text_from_literal_bytes(content)
	+	}
	+
	+	fn extract_text_from_operations(operations: &[Operation]) -> Vec<String> {
	+	let mut parts = Vec::new();
	+	for operation in operations {
	+	match operation.operator.as_str() {
	+	"Tj" \| "'" => {
	+	if let Some(text) = operation.operands.first().and_then(object_text) {
	+	push_pdf_text(&mut parts, text);
	+	}
	+	}
	+	"\"" => {
	+	if let Some(text) = operation.operands.last().and_then(object_text) {
	+	push_pdf_text(&mut parts, text);
	+	}
	+	}
	+	"TJ" => {
	+	if let Some(Object::Array(items)) = operation.operands.first() {
	+	push_pdf_text(&mut parts, text_from_array(items));
	+	}
	+	}
	+	_ => {}
	+	}
	+	}
	+	parts
	+	}
	+
	+	fn object_text(obj: &Object) -> Option<String> {
	+	match obj {
	+	Object::String(_, _) => decode_text_string(obj).ok(),
	+	_ => None,
	+	}
	+	}
	+
	+	fn text_from_array(items: &[Object]) -> String {
	+	let mut text = String::new();
	+	for item in items {
	+	match item {
	+	Object::String(_, _) => {
	+	if let Some(part) = object_text(item) {
	+	text.push_str(&part);
	+	}
	+	}
	+	Object::Integer(value) if *value < -100 => text.push(' '),
	+	Object::Real(value) if *value < -100.0 => text.push(' '),
	+	_ => {}
	+	}
	+	}
	+	text
	+	}
	+
	+	fn push_pdf_text(parts: &mut Vec<String>, text: String) {
	+	let trimmed = text.trim();
	+	if !trimmed.is_empty() {
	+	parts.push(trimmed.to_owned());
	+	}
	+	}
	+
	+	fn normalize_pdf_text_parts<I>(parts: I) -> String
	+	where
	+	I: IntoIterator<Item = String>,
	+	{
	+	parts
	+	.into_iter()
	+	.map(\|part\| part.trim().to_owned())
	+	.filter(\|part\| !part.is_empty())
	+	.collect::<Vec<_>>()
	+	.join("\n")
	+	}
	+
	+	fn extract_text_from_literal_bytes(content: &[u8]) -> Vec<String> {
		let mut parts = Vec::new();
		let mut i = 0;
	-	let chars: Vec<char> = content.chars().collect();
	+	let lossy = String::from_utf8_lossy(content);
	+	let chars: Vec<char> = lossy.chars().collect();
		while i < chars.len() {
		if chars[i] == '(' {
	-	// Find matching closing paren (handle nesting)
		let mut depth = 1;
		let mut j = i + 1;
		while j < chars.len() && depth > 0 {
		@@ -341,6 +404,7 @@ fn extract_text_from_content_stream(content: &str) -> Vec<String> {
		#[cfg(test)]
		mod tests {
		use super::*;
	+	use lopdf::{dictionary, Stream};

		#[test]
		fn pdf_adapter_can_handle() {
		@@ -387,7 +451,103 @@ mod tests {
		assert!(security_check(&doc).is_err());
		}

	-	// Note: Full embed/extract round-trip tests require a valid PDF file.
	-	// These are integration tests that should be run with test fixtures.
	-	// The unit tests above verify the adapter's detection and sanitization logic.
	+	#[test]
	+	fn extract_text_from_content_stream_parses_pdf_text_ops() {
	+	let content = Content {
	+	operations: vec![
	+	Operation::new("BT", vec![]),
	+	Operation::new("Tj", vec![Object::string_literal("Hello")]),
	+	Operation::new(
	+	"TJ",
	+	vec![Object::Array(vec![
	+	Object::string_literal("Rust"),
	+	Object::Integer(-120),
	+	Object::string_literal("Port"),
	+	])],
	+	),
	+	Operation::new("'", vec![Object::string_literal("Next line")]),
	+	Operation::new(
	+	"\"",
	+	vec![
	+	Object::Integer(0),
	+	Object::Integer(0),
	+	Object::string_literal("Quoted op"),
	+	],
	+	),
	+	Operation::new("ET", vec![]),
	+	],
	+	};
	+	let encoded = content.encode().unwrap();
	+	let parts = extract_text_from_content_stream(&encoded);
	+	assert_eq!(parts, vec!["Hello", "Rust Port", "Next line", "Quoted op"]);
	+	}
	+
	+	#[test]
	+	fn extract_text_for_fingerprint_reads_page_text() {
	+	let content = Content {
	+	operations: vec![
	+	Operation::new("BT", vec![]),
	+	Operation::new("Tf", vec!["F1".into(), Object::Integer(12)]),
	+	Operation::new("Td", vec![Object::Integer(72), Object::Integer(720)]),
	+	Operation::new("Tj", vec![Object::string_literal("Oversight Rust")]),
	+	Operation::new(
	+	"TJ",
	+	vec![Object::Array(vec![
	+	Object::string_literal("PDF"),
	+	Object::Integer(-120),
	+	Object::string_literal("Extraction"),
	+	])],
	+	),
	+	Operation::new("ET", vec![]),
	+	],
	+	};
	+	let pdf = minimal_pdf(content);
	+	let text = extract_text_for_fingerprint(&pdf).unwrap();
	+	assert!(text.contains("Oversight Rust"));
	+	assert!(text.contains("PDF Extraction"));
	+	}
	+
	+	fn minimal_pdf(content: Content<Vec<Operation>>) -> Vec<u8> {
	+	let mut doc = Document::with_version("1.5");
	+	let pages_id = doc.new_object_id();
	+	let font_id = doc.add_object(dictionary! {
	+	"Type" => "Font",
	+	"Subtype" => "Type1",
	+	"BaseFont" => "Helvetica",
	+	});
	+	let resources_id = doc.add_object(dictionary! {
	+	"Font" => dictionary! {
	+	"F1" => font_id,
	+	},
	+	});
	+	let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));
	+	let page_id = doc.add_object(dictionary! {
	+	"Type" => "Page",
	+	"Parent" => pages_id,
	+	"Contents" => content_id,
	+	});
	+	doc.objects.insert(
	+	pages_id,
	+	Object::Dictionary(dictionary! {
	+	"Type" => "Pages",
	+	"Kids" => vec![page_id.into()],
	+	"Count" => 1,
	+	"Resources" => resources_id,
	+	"MediaBox" => vec![
	+	Object::Integer(0),
	+	Object::Integer(0),
	+	Object::Integer(595),
	+	Object::Integer(842),
	+	],
	+	}),
	+	);
	+	let catalog_id = doc.add_object(dictionary! {
	+	"Type" => "Catalog",
	+	"Pages" => pages_id,
	+	});
	+	doc.trailer.set("Root", catalog_id);
	+	let mut output = Vec::new();
	+	doc.save_to(&mut output).unwrap();
	+	output
	+	}
		}