Files
Umbraco-CMS/src/UmbracoExamine.PDF/PDFIndexer.cs
Shannon Deminick 545a156942 Got most of #U4-1503 - External UmbracoExamine libraries built separately from the web app with the Build.bat command. Now
just need to get the output zipped up and then create some Nuget packages with dependencies.
2013-02-02 06:08:14 +06:00

276 lines
10 KiB
C#

using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.Linq;
using System.Security;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using Examine;
using iTextSharp.text.exceptions;
using iTextSharp.text.pdf;
using System.Text;
using Lucene.Net.Analysis;
using UmbracoExamine.DataServices;
namespace UmbracoExamine.PDF
{
/// <summary>
/// An Umbraco Lucene.Net indexer which will index the text content of a file
/// </summary>
public class PDFIndexer : BaseUmbracoIndexer
{
#region Constructors
/// <summary>
/// Creates an indexer with the default settings: only ".pdf" files are supported and
/// the file path is read from the "umbracoFile" property.
/// </summary>
public PDFIndexer()
    : base()
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new string[] { ".pdf" };
}
/// <summary>
/// Creates an indexer at runtime from a file-system directory. The base indexer is given
/// an index criteria with no fields, no include/exclude node types and no parent id, and
/// the PDF defaults (".pdf" / "umbracoFile") are applied.
/// </summary>
/// <param name="indexPath">Directory handed to the base indexer constructor.</param>
/// <param name="dataService">Data service handed to the base indexer constructor.</param>
/// <param name="analyzer">Lucene analyzer handed to the base indexer constructor.</param>
/// <param name="async">Async flag handed to the base indexer constructor.</param>
[SecuritySafeCritical]
public PDFIndexer(DirectoryInfo indexPath, IDataService dataService, Analyzer analyzer, bool async)
    : base(
        new IndexCriteria(
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<string>(),
            Enumerable.Empty<string>(),
            null),
        indexPath,
        dataService,
        analyzer,
        async)
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new string[] { ".pdf" };
}
/// <summary>
/// Creates an indexer at runtime from a Lucene directory. The base indexer is given
/// an index criteria with no fields, no include/exclude node types and no parent id, and
/// the PDF defaults (".pdf" / "umbracoFile") are applied.
/// </summary>
/// <param name="luceneDirectory">Lucene directory handed to the base indexer constructor.</param>
/// <param name="dataService">Data service handed to the base indexer constructor.</param>
/// <param name="analyzer">Lucene analyzer handed to the base indexer constructor.</param>
/// <param name="async">Async flag handed to the base indexer constructor.</param>
[SecuritySafeCritical]
public PDFIndexer(Lucene.Net.Store.Directory luceneDirectory, IDataService dataService, Analyzer analyzer, bool async)
    : base(
        new IndexCriteria(
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<string>(),
            Enumerable.Empty<string>(),
            null),
        luceneDirectory,
        dataService,
        analyzer,
        async)
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new string[] { ".pdf" };
}
#endregion
#region Properties
/// <summary>
/// Gets or sets the file extensions this indexer will extract text from.
/// The constructors set this to ".pdf"; Initialize replaces it with the comma-separated
/// "extensions" configuration value when one is supplied.
/// </summary>
/// <remarks>
/// Entries are compared case-insensitively with <see cref="FileInfo.Extension"/>, which
/// includes the leading dot, so entries should be written like ".pdf".
/// Whatever the extension, ExtractTextFromFile always runs the file through the PDF parser.
/// </remarks>
/// <value>The supported extensions.</value>
public IEnumerable<string> SupportedExtensions { get; set; }
/// <summary>
/// Gets or sets the alias of the Umbraco property that holds the file path
/// (defaults to "umbracoFile").
/// </summary>
/// <remarks>
/// Initialize overrides the default with the "umbracoFileProperty" configuration value when
/// one is supplied. GetDataToIndex matches this alias against each child element's "alias"
/// attribute when present, otherwise against the element name.
/// </remarks>
/// <value>The umbraco file property.</value>
public string UmbracoFileProperty { get; set; }
/// <summary>
/// The name of the index field under which GetDataToIndex stores the text
/// extracted from the file.
/// </summary>
/// <value>The name of the text content field.</value>
public const string TextContentFieldName = "FileTextContent";
/// <summary>
/// Gets the index types handled by this indexer: only <c>IndexTypes.Media</c>.
/// </summary>
protected override IEnumerable<string> SupportedTypes
{
    get
    {
        IEnumerable<string> mediaOnly = new string[] { IndexTypes.Media };
        return mediaOnly;
    }
}
#endregion
/// <summary>
/// Sets up the indexer from the supplied configuration. The base implementation runs first;
/// afterwards the optional "extensions" and "umbracoFileProperty" settings override the
/// defaults applied by the constructor.
/// </summary>
/// <param name="name">Name of the indexer, passed to the base implementation.</param>
/// <param name="config">
/// Configuration values. "extensions" is a comma-separated list of file extensions
/// (e.g. ".pdf, .PDF"); "umbracoFileProperty" is the alias of the property holding the file path.
/// </param>
public override void Initialize(string name, NameValueCollection config)
{
    base.Initialize(name, config);

    var extensions = config["extensions"];
    if (!string.IsNullOrEmpty(extensions))
    {
        // Trim every entry: a list written as ".pdf, .doc" would otherwise yield " .doc",
        // which can never equal a file's extension. Entries that are only whitespace are dropped.
        SupportedExtensions = extensions
            .Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
            .Select(x => x.Trim())
            .Where(x => x.Length > 0)
            .ToArray();
    }

    //checks if a custom field alias is specified
    var fileProperty = config["umbracoFileProperty"];
    if (!string.IsNullOrEmpty(fileProperty))
    {
        UmbracoFileProperty = fileProperty;
    }
}
/// <summary>
/// Extracts the text to be indexed from the specified file by running it through the PDF parser.
/// Parser errors are reported through OnIndexingError with the message "Could not read PDF"
/// and a node id of -1.
/// </summary>
/// <param name="file">The file to read; its extension must be one of SupportedExtensions.</param>
/// <returns>The text extracted from the file.</returns>
/// <exception cref="NotSupportedException">
/// The file's extension is not listed in SupportedExtensions.
/// </exception>
protected virtual string ExtractTextFromFile(FileInfo file)
{
    // Compare ordinally and case-insensitively. Upper-casing both sides with the current
    // culture is not reliable: under tr-TR "i".ToUpper() is "\u0130" rather than "I", so
    // ".tif" and ".TIF" would be treated as different extensions.
    if (!SupportedExtensions.Contains(file.Extension, StringComparer.OrdinalIgnoreCase))
    {
        throw new NotSupportedException("The file with the extension specified is not supported");
    }

    var parser = new PDFParser();
    Action<Exception> onError = e => OnIndexingError(new IndexingErrorEventArgs("Could not read PDF", -1, e));
    return parser.ParsePdfText(file.FullName, onError);
}
/// <summary>
/// Collects all of the data that needs to be indexed as defined in the index set, then adds
/// the text extracted from the node's file under <see cref="TextContentFieldName"/>.
/// </summary>
/// <remarks>
/// Nothing is added when the node has no file property, the property is empty, the mapped
/// file does not exist (logged as info) or the extension is unsupported (logged as error).
/// </remarks>
/// <param name="node">Media item XML being indexed</param>
/// <param name="type">Type of index (should only ever be media)</param>
/// <returns>Fields containing the data for the index</returns>
protected override Dictionary<string, string> GetDataToIndex(XElement node, string type)
{
    var fields = base.GetDataToIndex(node, type);

    //find the element which contains the file: match on the "alias" attribute when the
    //element has one, otherwise on the element name
    var fileElement = node.Elements().FirstOrDefault(x =>
    {
        var alias = x.Attribute("alias");
        return alias != null
            ? (string)alias == this.UmbracoFileProperty
            : x.Name == this.UmbracoFileProperty;
    });

    var relativePath = fileElement == null ? null : (string)fileElement;
    if (string.IsNullOrEmpty(relativePath))
    {
        //no file property, or nothing stored in it
        return fields;
    }

    //get the file path from the data service
    var fi = new FileInfo(this.DataService.MapPath(relativePath));
    if (!fi.Exists)
    {
        //log the stored path value; concatenating the XElement itself would write its
        //XML markup into the log instead of the path
        DataService.LogService.AddInfoLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: No file found at path " + relativePath);
        return fields;
    }

    try
    {
        fields.Add(TextContentFieldName, ExtractTextFromFile(fi));
    }
    catch (NotSupportedException)
    {
        //log that we couldn't index the file found
        DataService.LogService.AddErrorLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: Extension '" + fi.Extension + "' is not supported at this time");
    }

    return fields;
}
#region Internal PDFParser Class
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
internal class PDFParser
{
    /// <summary>
    /// Last character of the unsupported range: every character from U+0000 up to and
    /// including U+001F is stripped from the extracted text.
    /// </summary>
    /// <remarks>
    /// used as a reference:
    /// http://www.tamasoft.co.jp/en/general-info/unicode.html
    /// http://en.wikipedia.org/wiki/Summary_of_Unicode_character_assignments
    /// http://en.wikipedia.org/wiki/Unicode
    /// http://en.wikipedia.org/wiki/Basic_Multilingual_Plane
    /// </remarks>
    private const char LastUnsupportedChar = '\u001F';

    /// <summary>
    /// Return only the valid string contents of the PDF: the characters of every string
    /// token on every page, in page order, with unsupported characters removed.
    /// </summary>
    /// <param name="sourcePDF">Path of the PDF file to read.</param>
    /// <param name="onError">
    /// Callback invoked with each <see cref="InvalidPdfException"/> raised while opening the
    /// file or reading a page. After an error on a page, parsing continues with the next page;
    /// if the file cannot be opened an empty string is returned.
    /// </param>
    /// <returns>The extracted text; empty when nothing could be read.</returns>
    [SecuritySafeCritical]
    public string ParsePdfText(string sourcePDF, Action<Exception> onError)
    {
        var sb = new StringBuilder();

        PdfReader reader;
        try
        {
            reader = new PdfReader(sourcePDF);
        }
        catch (InvalidPdfException ex)
        {
            // report an invalid file through the same callback used for invalid pages
            onError(ex);
            return sb.ToString();
        }

        try
        {
            for (var page = 1; page <= reader.NumberOfPages; page++)
            {
                try
                {
                    var pageBytes = reader.GetPageContent(page);
                    if (pageBytes == null)
                    {
                        continue;
                    }

                    var tokeniser = new PRTokeniser(pageBytes);
                    while (tokeniser.NextToken())
                    {
                        if (tokeniser.TokenType != PRTokeniser.TokType.STRING)
                        {
                            continue;
                        }

                        foreach (var c in tokeniser.StringValue)
                        {
                            //strip out unsupported characters, based on unicode tables.
                            if (c > LastUnsupportedChar)
                            {
                                sb.Append(c);
                            }
                        }
                    }
                }
                catch (InvalidPdfException ex)
                {
                    // keep whatever was read so far and move on to the next page
                    onError(ex);
                }
            }
        }
        finally
        {
            // always close the reader, even when an unexpected exception escapes
            reader.Close();
        }

        return sb.ToString();
    }
}
#endregion
}
}