diff --git a/lib/pdf/reader/buffer.rb b/lib/pdf/reader/buffer.rb index 69062ee4..8fe4fc95 100644 --- a/lib/pdf/reader/buffer.rb +++ b/lib/pdf/reader/buffer.rb @@ -152,10 +152,30 @@ def token #: () -> Integer def find_first_xref_offset check_size_is_non_zero - @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0) - data = @io.read(TRAILING_BYTECOUNT) - raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil? + # Skip trailing null bytes to find the effective end of the PDF. + # Some generators (e.g. Atos/Fonet) append thousands of null bytes + # after %%EOF. Scan backwards in chunks so a file padded with a huge + # run of nulls doesn't turn into a per-byte seek+read loop. + @io.seek(0, IO::SEEK_END) + end_pos = @io.pos + + while end_pos > 0 + chunk_size = [TRAILING_BYTECOUNT, end_pos].min + @io.seek(end_pos - chunk_size) + chunk = @io.read(chunk_size) + if chunk && (idx = chunk.rindex(/[^\x00]/)) + end_pos = end_pos - chunk_size + idx + 1 + break + end + end_pos -= chunk_size + end + + start_pos = [end_pos - TRAILING_BYTECOUNT, 0].max + @io.seek(start_pos) + data = @io.read(end_pos - start_pos) + + raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil? || data.empty? # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both. lines = data.split(/[\n\r]+/).reverse diff --git a/spec/data/trailing_null_bytes.pdf b/spec/data/trailing_null_bytes.pdf new file mode 100644 index 00000000..ad3f0424 Binary files /dev/null and b/spec/data/trailing_null_bytes.pdf differ diff --git a/spec/integrity.yml b/spec/integrity.yml index 957652c7..f71414d1 100644 --- a/spec/integrity.yml +++ b/spec/integrity.yml @@ -449,6 +449,9 @@ data/tounicode-wrong-type-indirect.pdf: data/tounicode-wrong-type.pdf: :bytes: 12219 :md5: 0be721e975bc8ec21eae829e0cddc1af +data/trailing_null_bytes.pdf: + :bytes: 67721 + :md5: 6b9b6375ca194fdf8687a44130c54df2 data/truetype-arial.pdf: :bytes: 1387 :md5: 2b3e4ff85b618d1f4c6b3b5df2631ab0 diff --git a/spec/reader/buffer_spec.rb b/spec/reader/buffer_spec.rb index 9e730efc..cfe340db 100644 --- a/spec/reader/buffer_spec.rb +++ b/spec/reader/buffer_spec.rb @@ -631,6 +631,24 @@ expect(buffer.find_first_xref_offset).to eql(145) end end + + context "trailing_null_bytes.pdf (null bytes after the EOF marker)" do + it "finds the first xref offset" do + file = File.new pdf_spec_file("trailing_null_bytes") + buffer = PDF::Reader::Buffer.new file + + expect(buffer.find_first_xref_offset).to eql(145) + end + end + + context "when the file is nothing but null bytes" do + it "raises a MalformedPDFError" do + io = StringIO.new("\x00" * 10_000) + buffer = PDF::Reader::Buffer.new(io) + + expect { buffer.find_first_xref_offset }.to raise_error(PDF::Reader::MalformedPDFError) + end + end end describe PDF::Reader::Buffer, "read method" do