PDF Page Count Algorithm
December 2004
It's not often that you can't find coding examples on the internet to do basic programming tasks, so I was surprised when I couldn't find code to determine the page count of PDF documents.
I've had to return to my old hacking ways (i.e. 'hacking' in its original sense). I visited my favorite hacking website - www.wotsit.org - downloaded the PDF document specifications and got coding.
It didn't take too long to construct a reasonably efficient algorithm which does what I want. I've thrown a few PDFs at it over the last couple of days so I hope I've squashed most of the bugs.
Update 22-Dec-2005:
Yes, I did find a couple of minor bugs which have now been corrected.
Code snippet ...
type
  //One cross-reference (xref) table entry: which object, and where it
  //lives in the file.
  PPdfObj = ^TPdfObj;
  TPdfObj = record
    number: integer; //PDF object number
    offset: integer; //byte offset of the object as read from the xref table
  end;
//Returns the number of pages in the PDF file 'filename', or -1 when the
//file cannot be read or parsed (all exceptions are swallowed and reported
//as -1).
//
//Only classic cross-reference tables ('xref' ... 'trailer') are understood;
//a file whose 'startxref' offset does not point at the 'xref' keyword
//yields -1. Comments and literal strings inside dictionaries are not
//tokenised, so a '<<' or '>>' embedded in a string can still confuse the
//dictionary scanner.
//
//Side effects: shows the hourglass cursor while working (the previous
//cursor is restored afterwards) and calls Application.ProcessMessages once.
function GetPdfPageCount(const filename: string): integer;
var
  ms: TMemoryStream;
  k, cnt, pagesNum, rootNum, i: integer;
  p, p2: PAnsiChar;          //p = current parse position, p2 = scratch
  bufStart, bufEnd: PAnsiChar; //file image is [bufStart, bufEnd)
  PdfObj: PPdfObj;
  PdfObjList: TList;
  xrefSeen: array of integer; //xref offsets already processed
  savedCursor: TCursor;

  //Summary of steps taken to parse PDF file for page count :-
  //1. Locate 'startxref' at end of file
  //2. get 'xref' offset and go to xref table
  //3. fill my pdfObj List with object numbers and offsets
  //4. handle subsections within xref table.
  //5. read 'trailer' section at end of each xref
  //6. store 'Root' object number if found in 'trailer'
  //7. if 'Prev' xref found in 'trailer' - loop back to step 2
  //8. locate Root in my full pdfObj List
  //9. locate 'Pages' object from Root
  //10. get Count from Pages.

  //Skips leading whitespace/control bytes, then reads an unsigned decimal
  //integer at p. Returns false (without raising) when no digits are
  //present or the value does not fit in an integer. Never reads past the
  //end of the buffer.
  function GetNumber(out num: integer): boolean;
  var
    tmpStr: string;
  begin
    tmpStr := '';
    while (p < bufEnd) and (p^ < #33) do inc(p); //skip leading CR,LF & SPC
    while (p < bufEnd) and (p^ in ['0'..'9']) do
    begin
      tmpStr := tmpStr + Char(p^);
      inc(p);
    end;
    result := TryStrToInt(tmpStr, num);
  end;

  //True when the bytes at p equal 'str'. On a match p is moved past the
  //text; on a mismatch p is left untouched so callers can keep scanning
  //from the same position.
  function IsString(const str: AnsiString): boolean;
  var
    len: integer;
  begin
    len := length(str);
    result := (p >= bufStart) and (bufEnd - p >= len) and
      CompareMem(p, PAnsiChar(str), len);
    if result then inc(p, len);
  end;

  //Scans forward from p for the key 'str' belonging to the first
  //dictionary encountered. '<<' / '>>' nesting is tracked so that nested
  //dictionaries neither end the search early nor contribute their own
  //keys. On success p is left just after the key. Returns false when the
  //dictionary (or the buffer) ends first.
  function FindStrInDict(const str: AnsiString): boolean;
  var
    depth: integer;
  begin
    //PDF 'dictionaries' (assoc. arrays) terminate with '>>'
    result := false;
    if str = '' then exit;
    depth := 0;
    while p < bufEnd do
    begin
      if (p^ = '<') and (bufEnd - p >= 2) and (p[1] = '<') then
      begin
        inc(depth);
        inc(p, 2);
      end
      else if (p^ = '>') and (bufEnd - p >= 2) and (p[1] = '>') then
      begin
        dec(depth);
        inc(p, 2);
        if depth <= 0 then exit; //outermost dictionary closed
      end
      else if (depth <= 1) and (p^ = str[1]) and IsString(str) then
      begin
        result := true;
        exit;
      end
      else
        inc(p);
    end;
  end;

  //Positions p just after the 'xref' keyword found at file offset
  //'offset'. Fails when the offset lies outside the file, does not point
  //at 'xref', or has been visited before (a '/Prev' chain that loops).
  function GotoXref(offset: integer): boolean;
  var
    j: integer;
  begin
    result := false;
    if (offset < 0) or (offset >= bufEnd - bufStart) then exit;
    for j := 0 to High(xrefSeen) do
      if xrefSeen[j] = offset then exit;
    SetLength(xrefSeen, Length(xrefSeen) + 1);
    xrefSeen[High(xrefSeen)] := offset;
    p := bufStart + offset;
    result := IsString('xref');
  end;

  //Looks up object 'objNum' in the collected xref entries (first match
  //wins, i.e. the most recently read xref section), moves p to its offset
  //and confirms that the object number stored there really is 'objNum'.
  //On success p is left just after that object number.
  function GotoObject(objNum: integer): boolean;
  var
    j, num: integer;
    obj: PPdfObj;
  begin
    result := false;
    for j := 0 to PdfObjList.Count - 1 do
    begin
      obj := PPdfObj(PdfObjList[j]);
      if obj^.number <> objNum then continue;
      if (obj^.offset < 0) or (obj^.offset >= bufEnd - bufStart) then exit;
      p := bufStart + obj^.offset;
      result := GetNumber(num) and (num = objNum);
      exit;
    end;
  end;

begin
  //on error return -1 as page count
  result := -1;
  ms := nil;
  PdfObjList := nil;
  savedCursor := screen.Cursor;
  try
    try
      ms := TMemoryStream.Create;
      PdfObjList := TList.Create;
      screen.Cursor := crHourGlass;
      application.ProcessMessages;

      ms.LoadFromFile(filename);
      bufStart := PAnsiChar(ms.Memory);
      bufEnd := bufStart + integer(ms.Size);
      //too small to hold 'startxref' plus an offset
      if bufEnd - bufStart < 10 then exit;

      //find the last 'startxref' in the file, searching backwards so that
      //'%%EOF' and any rubbish appended to the pdf are ignored
      p := bufEnd - 9;
      while not IsString('startxref') do
      begin
        if p = bufStart then exit;
        dec(p);
      end;

      rootNum := -1; //ie flag not yet found

      //xref offset ==> k
      if not GetNumber(k) then exit;
      if not GotoXref(k) then exit;

      while true do  //top of loop  //////////////////////////////
      begin
        //get base object number ==> k
        if not GetNumber(k) then exit;
        //get object count ==> cnt
        if not GetNumber(cnt) then exit;
        while (p < bufEnd) and (p^ < #33) do inc(p); //skip CR, LF
        p2 := p;
        //add all objects in section to list ...
        for i := 0 to cnt - 1 do
        begin
          //every xref entry is exactly 20 bytes long
          if bufEnd - p2 < 20 then exit;
          new(PdfObj);
          PdfObjList.Add(PdfObj);
          PdfObj^.number := k + i;
          if not GetNumber(PdfObj^.offset) then exit;
          inc(p2, 20);
          p := p2;
        end;
        while (p < bufEnd) and (p^ < #33) do inc(p);
        //check for and process further subsections ...
        if (p < bufEnd) and (p^ in ['0'..'9']) then continue;

        // parse 'trailer dictionary' ...
        if not IsString('trailer') then exit;
        p2 := p;
        // get Root (aka /Catalog) ...
        if (rootNum = -1) and FindStrInDict('/Root') then
          if not GetNumber(rootNum) then exit;
        //rescan the same trailer from its start for '/Prev'
        p := p2;
        if not FindStrInDict('/Prev') then
          break; //no more xrefs
        //next xref offset ==> k
        if not GetNumber(k) then exit;
        if not GotoXref(k) then exit;
      end; //bottom of loop  /////////////////////////////////////

      //Make sure we've got the Root object number ...
      if rootNum < 0 then exit;
      //Find Root object in list, go to its offset and double check it ...
      if not GotoObject(rootNum) then exit;
      if not FindStrInDict('/Pages') then exit;
      //get Pages object number ==> pagesNum
      if not GetNumber(pagesNum) then exit;

      //same again for the Pages object ...
      if not GotoObject(pagesNum) then exit;
      if not FindStrInDict('/Count') then exit;
      if not GetNumber(cnt) then exit;

      //occasionally the 'count' value is an indirect object ('n g R')
      if GetNumber(k) and IsString(' R') then
      begin
        //cnt currently holds the object number of the real count value
        if not GotoObject(cnt) then exit;
        if not GetNumber(k) or       //skip the generation num
          not IsString(' obj') or
          not GetNumber(cnt) then exit;
      end;
      result := cnt;

    finally
      screen.Cursor := savedCursor;
      if PdfObjList <> nil then
        for i := 0 to PdfObjList.Count - 1 do
          dispose(PPdfObj(PdfObjList[i]));
      PdfObjList.Free;
      ms.Free;
    end;
  except
    //nb: errors are flagged by returning -1
    result := -1;
  end;
end;