We had a similar problem on our project recently, and we implemented a custom text extractor for PDFs, which seems to be working fine so far.
using System;
using System.IO;
using CMS.Base;
using CMS.DataEngine;
using CMS.Search;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.util;
/// <summary>
/// Search text extractor that pulls plain text out of PDF binary data using PDFBox.
/// </summary>
public class CustomSearchTextExtractor : ISearchTextExtractor
{
    /// <summary>
    /// Writes the supplied binary data to a temporary PDF file, extracts its text
    /// with <c>PDFTextStripper</c> and returns the text as search content.
    /// </summary>
    /// <param name="data">Binary data of the PDF file; its <c>Data</c> bytes are written to disk.</param>
    /// <param name="context">Extraction context; not used by this implementation.</param>
    /// <returns>
    /// An <c>XmlData</c> whose <c>SearchFieldsConstants.CONTENT</c> value holds the extracted text.
    /// </returns>
    public CMS.Base.XmlData ExtractContent(CMS.Core.BinaryData data, ExtractionContext context)
    {
        string result = String.Empty;

        // Build a unique path without touching the disk. Path.GetTempFileName() must not be
        // used here: it creates a zero-byte ".tmp" file as a side effect, and because the
        // ".pdf" suffix yields a different path, that placeholder file would never be deleted.
        string tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N") + ".pdf");

        PDDocument doc = null;
        try
        {
            // The document is loaded from a file path, so the bytes are persisted first.
            File.WriteAllBytes(tempPath, data.Data);
            doc = PDDocument.load(tempPath);

            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(doc);
        }
        finally
        {
            // Close the document before deleting the file it was loaded from.
            if (doc != null)
            {
                doc.close();
            }

            if (File.Exists(tempPath))
            {
                File.Delete(tempPath);
            }
        }

        var content = new XmlData();
        content.SetValue(SearchFieldsConstants.CONTENT, result);
        return content;
    }
}