Added UmbracoExamine.PDF library and unit tests.

This commit is contained in:
Shannon Deminick
2013-01-05 04:43:15 +03:00
parent d1bf2c0ad7
commit 6513097d38
11 changed files with 523 additions and 10 deletions

View File

@@ -0,0 +1,277 @@
using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.Linq;
using System.Security;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using Examine;
using iTextSharp.text.exceptions;
using iTextSharp.text.pdf;
using System.Text;
using Lucene.Net.Analysis;
using UmbracoExamine.DataServices;
namespace UmbracoExamine.PDF
{
/// <summary>
/// An Umbraco Lucene.Net indexer which will index the text content of a file
/// </summary>
public class PDFIndexer : BaseUmbracoIndexer
{
#region Constructors
/// <summary>
/// Parameterless constructor. Applies the built-in defaults: only ".pdf" files are
/// supported and the file path is read from the "umbracoFile" property.
/// </summary>
public PDFIndexer()
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new[] { ".pdf" };
}
/// <summary>
/// Creates an indexer at runtime (without provider configuration) on top of a file system directory.
/// </summary>
/// <remarks>
/// The base indexer receives index criteria built from empty sequences and a null final argument;
/// the same ".pdf" / "umbracoFile" defaults as the parameterless constructor are then applied.
/// </remarks>
/// <param name="indexPath">Directory handed to the base indexer.</param>
/// <param name="dataService">Data service handed to the base indexer.</param>
/// <param name="analyzer">Lucene analyzer handed to the base indexer.</param>
/// <param name="async">Async flag handed to the base indexer.</param>
[SecuritySafeCritical]
public PDFIndexer(DirectoryInfo indexPath, IDataService dataService, Analyzer analyzer, bool async)
    : base(
        new IndexCriteria(
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<string>(),
            Enumerable.Empty<string>(),
            null),
        indexPath,
        dataService,
        analyzer,
        async)
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new[] { ".pdf" };
}
/// <summary>
/// Creates an indexer at runtime (without provider configuration) on top of an existing Lucene directory.
/// </summary>
/// <remarks>
/// The base indexer receives index criteria built from empty sequences and a null final argument;
/// the same ".pdf" / "umbracoFile" defaults as the parameterless constructor are then applied.
/// </remarks>
/// <param name="luceneDirectory">Lucene directory handed to the base indexer.</param>
/// <param name="dataService">Data service handed to the base indexer.</param>
/// <param name="analyzer">Lucene analyzer handed to the base indexer.</param>
/// <param name="async">Async flag handed to the base indexer.</param>
[SecuritySafeCritical]
public PDFIndexer(Lucene.Net.Store.Directory luceneDirectory, IDataService dataService, Analyzer analyzer, bool async)
    : base(
        new IndexCriteria(
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<string>(),
            Enumerable.Empty<string>(),
            null),
        luceneDirectory,
        dataService,
        analyzer,
        async)
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new[] { ".pdf" };
}
#endregion
#region Properties
/// <summary>
/// Gets or sets the file extensions this indexer will extract text from.
/// </summary>
/// <remarks>
/// The constructors default this to ".pdf" only; <c>Initialize</c> replaces it with the
/// comma-separated "extensions" configuration value when one is supplied. Values are compared
/// case-insensitively against <see cref="System.IO.FileInfo.Extension"/>, which includes the
/// leading dot, so entries should be written as ".pdf" rather than "pdf".
/// </remarks>
/// <value>The supported extensions.</value>
public IEnumerable<string> SupportedExtensions { get; set; }
/// <summary>
/// Gets or sets the umbraco property alias that holds the path of the file to index
/// (defaults to "umbracoFile").
/// </summary>
/// <remarks>
/// <c>Initialize</c> overrides the default with the "umbracoFileProperty" configuration value when
/// one is supplied. When indexing, the value is matched against each child element's "alias"
/// attribute or, if the element has no such attribute, against the element name.
/// </remarks>
/// <value>The umbraco file property.</value>
public string UmbracoFileProperty { get; set; }
/// <summary>
/// The name of the Lucene.Net field which the extracted file text is inserted into.
/// </summary>
/// <remarks>
/// Used as the dictionary key under which <c>GetDataToIndex</c> adds the text returned by
/// <c>ExtractTextFromFile</c>.
/// </remarks>
/// <value>The name of the text content field.</value>
public const string TextContentFieldName = "FileTextContent";
/// <summary>
/// The index types handled by this indexer: media only.
/// </summary>
protected override IEnumerable<string> SupportedTypes
{
    get
    {
        // a fresh one-element array on every call, containing just the media type
        var mediaOnly = new string[1];
        mediaOnly[0] = IndexTypes.Media;
        return mediaOnly;
    }
}
#endregion
/// <summary>
/// Initializes the indexer from its provider configuration. The base implementation runs first;
/// afterwards the PDF specific, optional settings are applied:
/// <list type="bullet">
/// <item><description>"extensions": comma-separated list of file extensions (including the leading dot) that replaces <see cref="SupportedExtensions"/>.</description></item>
/// <item><description>"umbracoFileProperty": property alias that replaces <see cref="UmbracoFileProperty"/>.</description></item>
/// </list>
/// Settings that are missing or empty leave the defaults set by the constructor untouched.
/// </summary>
/// <param name="name">Provider name, passed on to the base implementation.</param>
/// <param name="config">Provider configuration attributes.</param>
public override void Initialize(string name, NameValueCollection config)
{
    base.Initialize(name, config);

    var configuredExtensions = config["extensions"];
    if (!string.IsNullOrEmpty(configuredExtensions))
    {
        // Trim every entry: a list written as ".pdf, .doc" would otherwise yield " .doc",
        // which can never equal FileInfo.Extension. Materialize once so the property does
        // not hold a deferred query that is re-evaluated on every extension check.
        SupportedExtensions = configuredExtensions
            .Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
            .Select(x => x.Trim())
            .Where(x => x.Length > 0)
            .ToArray();
    }

    //checks if a custom field alias is specified
    var configuredFileProperty = config["umbracoFileProperty"];
    if (!string.IsNullOrEmpty(configuredFileProperty))
    {
        UmbracoFileProperty = configuredFileProperty;
    }
}
/// <summary>
/// Provides the means to extract the text to be indexed from the file specified.
/// </summary>
/// <remarks>
/// PDF read errors reported by the parser are raised through <c>OnIndexingError</c> with a node id
/// of -1, since this method only receives the file and not the node it belongs to.
/// </remarks>
/// <param name="file">The file to read; its extension must be one of <see cref="SupportedExtensions"/>.</param>
/// <returns>The text extracted from the file.</returns>
/// <exception cref="NotSupportedException">The file extension is not listed in <see cref="SupportedExtensions"/>.</exception>
protected virtual string ExtractTextFromFile(FileInfo file)
{
    // Ordinal, case-insensitive comparison: file extensions are identifiers, not linguistic text.
    // Upper-casing both sides with the current culture breaks under cultures with special casing
    // rules (e.g. "i" under tr-TR) and allocates a new string per configured extension.
    if (!SupportedExtensions.Contains(file.Extension, StringComparer.OrdinalIgnoreCase))
    {
        throw new NotSupportedException("The file with the extension specified is not supported");
    }

    var pdf = new PDFParser();
    Action<Exception> onError = e => OnIndexingError(new IndexingErrorEventArgs("Could not read PDF", -1, e));
    return pdf.ParsePdfText(file.FullName, onError);
}
/// <summary>
/// Collects all of the data that needs to be indexed as defined in the index set and, when the
/// node references an existing file with a supported extension, adds the text extracted from
/// that file under <see cref="TextContentFieldName"/>.
/// </summary>
/// <remarks>
/// A missing file is written to the info log and an unsupported extension to the error log; in
/// both cases the fields collected by the base implementation are still returned.
/// </remarks>
/// <param name="node">Media item XML being indexed</param>
/// <param name="type">Type of index (should only ever be media)</param>
/// <returns>Fields containing the data for the index</returns>
protected override Dictionary<string, string> GetDataToIndex(XElement node, string type)
{
    var fields = base.GetDataToIndex(node, type);

    //find the field which contains the file: matched on the "alias" attribute when the element
    //has one, otherwise on the element name
    var fileElement = node.Elements().FirstOrDefault(x =>
    {
        var alias = x.Attribute("alias");
        return alias != null
            ? (string)alias == this.UmbracoFileProperty
            : x.Name == this.UmbracoFileProperty;
    });

    //nothing to do when the node has no file property or the property is empty
    var filePath = fileElement == null ? null : (string)fileElement;
    if (string.IsNullOrEmpty(filePath))
    {
        return fields;
    }

    //get the file path from the data service and make sure the file exists
    var fi = new FileInfo(this.DataService.MapPath(filePath));
    if (!fi.Exists)
    {
        //log the property value itself: concatenating the XElement would write its XML markup
        //into the message instead of the path
        DataService.LogService.AddInfoLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: No file found at path " + filePath);
        return fields;
    }

    try
    {
        fields.Add(TextContentFieldName, ExtractTextFromFile(fi));
    }
    catch (NotSupportedException)
    {
        //log that we couldn't index the file found
        DataService.LogService.AddErrorLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: Extension '" + fi.Extension + "' is not supported at this time");
    }

    return fields;
}
#region Internal PDFParser Class
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
internal class PDFParser
{
    /// <summary>
    /// The lowest character code that is kept in the extracted text. Everything below it
    /// (U+0000 to U+001F) is treated as unsupported and stripped.
    /// </summary>
    /// <remarks>
    /// used as a reference:
    /// http://www.tamasoft.co.jp/en/general-info/unicode.html
    /// http://en.wikipedia.org/wiki/Summary_of_Unicode_character_assignments
    /// http://en.wikipedia.org/wiki/Unicode
    /// http://en.wikipedia.org/wiki/Basic_Multilingual_Plane
    /// </remarks>
    private const char FirstSupportedChar = '\u0020';

    /// <summary>
    /// Return only the valid string contents of the PDF
    /// </summary>
    /// <remarks>
    /// Every page's content stream is tokenised and the characters of all string tokens are
    /// concatenated in order, without separators, skipping the unsupported characters. When a page
    /// raises an <see cref="InvalidPdfException"/> the error is reported through
    /// <paramref name="onError"/> and parsing continues with the next page; text collected
    /// before the failure is kept.
    /// </remarks>
    /// <param name="sourcePDF">Path of the PDF file to read.</param>
    /// <param name="onError">Callback invoked with the exception when a page cannot be tokenised.</param>
    /// <returns>The concatenated text of all string tokens found in the document.</returns>
    [SecuritySafeCritical]
    public string ParsePdfText(string sourcePDF, Action<Exception> onError)
    {
        var sb = new StringBuilder();
        var reader = new PdfReader(sourcePDF);
        try
        {
            // PDF page numbers are 1-based
            for (var i = 1; i <= reader.NumberOfPages; i++)
            {
                var pageBytes = reader.GetPageContent(i);
                if (pageBytes == null)
                {
                    continue;
                }

                var token = new PRTokeniser(pageBytes);
                try
                {
                    while (token.NextToken())
                    {
                        if (token.TokenType != PRTokeniser.TokType.STRING)
                        {
                            continue;
                        }

                        foreach (var c in token.StringValue)
                        {
                            //strip out unsupported characters, based on unicode tables.
                            if (c >= FirstSupportedChar)
                            {
                                sb.Append(c);
                            }
                        }
                    }
                }
                catch (InvalidPdfException ex)
                {
                    onError(ex);
                }
            }
        }
        finally
        {
            // always release the reader, also when a page throws something other than InvalidPdfException
            reader.Close();
        }

        return sb.ToString();
    }
}
#endregion
}
}

View File

@@ -0,0 +1,31 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyCompany("umbraco")]
[assembly: AssemblyCopyright("Copyright © Umbraco 2012")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
[assembly: AssemblyTitle("UmbracoExamine.PDF")]
[assembly: AssemblyDescription("Umbraco index providers for PDF based on the Examine model using Lucene.NET 2.9.2")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyProduct("UmbracoExamine.PDF")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("8933a78d-8414-4c72-a74d-76aa7fb0e9ad")]
// NOTE: We cannot change the major version to match Umbraco because of backwards compatibility; however,
// the minor version is kept the same as the Umbraco version.
[assembly: AssemblyVersion("0.6.0.*")]
// AssemblyFileVersion is intentionally not specified: unlike AssemblyVersion it does not support the
// '*' wildcard (the compiler warns and stores the literal text). When the attribute is omitted the
// Win32 file version falls back to the generated assembly version above.
[assembly: AllowPartiallyTrustedCallers]

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>8.0.30703</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{F30DDDB8-3994-4673-82AE-057123C6E1A8}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>UmbracoExamine.PDF</RootNamespace>
<AssemblyName>UmbracoExamine.PDF</AssemblyName>
<TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<SccProjectName>
</SccProjectName>
<SccLocalPath>
</SccLocalPath>
<SccAuxPath>
</SccAuxPath>
<SccProvider>
</SccProvider>
<TargetFrameworkProfile />
<SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\</SolutionDir>
<RestorePackages>true</RestorePackages>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<CodeAnalysisRuleSet>SecurityRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<DocumentationFile>bin\Release\UmbracoExamine.PDF.XML</DocumentationFile>
</PropertyGroup>
<ItemGroup>
<Reference Include="Examine">
<HintPath>..\packages\Examine.0.1.42.2941\lib\Examine.dll</HintPath>
</Reference>
<Reference Include="ICSharpCode.SharpZipLib">
<HintPath>..\packages\SharpZipLib.0.86.0\lib\20\ICSharpCode.SharpZipLib.dll</HintPath>
</Reference>
<Reference Include="itextsharp">
<HintPath>..\packages\iTextSharp.5.3.3\lib\itextsharp.dll</HintPath>
</Reference>
<Reference Include="Lucene.Net">
<HintPath>..\packages\Lucene.Net.2.9.4.1\lib\net40\Lucene.Net.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.configuration" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="System.Data" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="PDFIndexer.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\UmbracoExamine\UmbracoExamine.csproj">
<Project>{07fbc26b-2927-4a22-8d96-d644c667fecc}</Project>
<Name>UmbracoExamine</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Import Project="$(SolutionDir)\.nuget\nuget.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

View File

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Examine" version="0.1.42.2941" targetFramework="net40" />
<package id="iTextSharp" version="5.3.3" targetFramework="net40" />
<package id="Lucene.Net" version="2.9.4.1" targetFramework="net40" />
<package id="SharpZipLib" version="0.86.0" targetFramework="net40" />
</packages>