Added UmbracoExamine.PDF library and unit tests.
This commit is contained in:
277
src/UmbracoExamine.PDF/PDFIndexer.cs
Normal file
277
src/UmbracoExamine.PDF/PDFIndexer.cs
Normal file
@@ -0,0 +1,277 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Collections.Specialized;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Security;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Xml.Linq;
|
||||
using Examine;
|
||||
using iTextSharp.text.exceptions;
|
||||
using iTextSharp.text.pdf;
|
||||
using System.Text;
|
||||
using Lucene.Net.Analysis;
|
||||
using UmbracoExamine.DataServices;
|
||||
|
||||
|
||||
namespace UmbracoExamine.PDF
|
||||
{
|
||||
/// <summary>
|
||||
/// An Umbraco Lucene.Net indexer which will index the text content of a file
|
||||
/// </summary>
|
||||
public class PDFIndexer : BaseUmbracoIndexer
|
||||
{
|
||||
#region Constructors
|
||||
|
||||
/// <summary>
/// Creates an indexer with the default settings: the file path is read from the
/// "umbracoFile" property and only ".pdf" files are supported.
/// </summary>
/// <remarks>
/// Both defaults can be replaced afterwards, either through the public properties or by
/// the "umbracoFileProperty" / "extensions" settings read in <see cref="Initialize"/>.
/// </remarks>
public PDFIndexer()
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new string[] { ".pdf" };
}
|
||||
|
||||
/// <summary>
/// Creates an indexer at runtime whose index location is given as a directory on disk.
/// </summary>
/// <remarks>
/// The base indexer receives an <see cref="IndexCriteria"/> built from empty sequences and
/// a null final argument. The indexer itself starts with the default settings: ".pdf" as
/// the only supported extension and "umbracoFile" as the file property alias.
/// </remarks>
/// <param name="indexPath">Directory forwarded to the base indexer.</param>
/// <param name="dataService">Data service forwarded to the base indexer.</param>
/// <param name="analyzer">Lucene analyzer forwarded to the base indexer.</param>
/// <param name="async">Flag forwarded unchanged to the base indexer.</param>
[SecuritySafeCritical]
public PDFIndexer(DirectoryInfo indexPath, IDataService dataService, Analyzer analyzer, bool async)
    : base(
        new IndexCriteria(
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<string>(),
            Enumerable.Empty<string>(),
            null),
        indexPath,
        dataService,
        analyzer,
        async)
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new string[] { ".pdf" };
}
|
||||
|
||||
/// <summary>
/// Creates an indexer at runtime whose index location is given as a Lucene directory.
/// </summary>
/// <remarks>
/// The base indexer receives an <see cref="IndexCriteria"/> built from empty sequences and
/// a null final argument. The indexer itself starts with the default settings: ".pdf" as
/// the only supported extension and "umbracoFile" as the file property alias.
/// </remarks>
/// <param name="luceneDirectory">Lucene directory forwarded to the base indexer.</param>
/// <param name="dataService">Data service forwarded to the base indexer.</param>
/// <param name="analyzer">Lucene analyzer forwarded to the base indexer.</param>
/// <param name="async">Flag forwarded unchanged to the base indexer.</param>
[SecuritySafeCritical]
public PDFIndexer(Lucene.Net.Store.Directory luceneDirectory, IDataService dataService, Analyzer analyzer, bool async)
    : base(
        new IndexCriteria(
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<IIndexField>(),
            Enumerable.Empty<string>(),
            Enumerable.Empty<string>(),
            null),
        luceneDirectory,
        dataService,
        analyzer,
        async)
{
    this.UmbracoFileProperty = "umbracoFile";
    this.SupportedExtensions = new string[] { ".pdf" };
}
|
||||
|
||||
#endregion
|
||||
|
||||
|
||||
#region Properties
|
||||
/// <summary>
|
||||
/// Gets or sets the supported file extensions. Currently the system will only
|
||||
/// process PDF files.
|
||||
/// </summary>
|
||||
/// <value>The supported extensions.</value>
|
||||
public IEnumerable<string> SupportedExtensions { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the umbraco property alias (defaults to umbracoFile)
|
||||
/// </summary>
|
||||
/// <value>The umbraco file property.</value>
|
||||
public string UmbracoFileProperty { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The name of the Lucene.Net field which the extracted file text is inserted into
|
||||
/// </summary>
|
||||
/// <value>The name of the text content field.</value>
|
||||
public const string TextContentFieldName = "FileTextContent";
|
||||
|
||||
/// <summary>
/// The index types handled by this indexer: a single entry, <see cref="IndexTypes.Media"/>.
/// </summary>
protected override IEnumerable<string> SupportedTypes
{
    get
    {
        var mediaOnly = new string[] { IndexTypes.Media };
        return mediaOnly;
    }
}
|
||||
|
||||
#endregion
|
||||
|
||||
/// <summary>
/// Initializes the indexer from configuration. The base implementation runs first; afterwards
/// two optional settings specific to this indexer are applied:
/// "extensions" (a comma separated list that replaces <see cref="SupportedExtensions"/>) and
/// "umbracoFileProperty" (replaces <see cref="UmbracoFileProperty"/>).
/// </summary>
/// <remarks>
/// Each entry of "extensions" is trimmed, so <c>".pdf, .doc"</c> yields <c>".pdf"</c> and
/// <c>".doc"</c>. Entries that are empty after trimming are dropped, and if nothing usable
/// remains the current <see cref="SupportedExtensions"/> value is kept.
/// </remarks>
/// <param name="name">The provider name, passed through to the base implementation.</param>
/// <param name="config">The configuration values for this provider.</param>
public override void Initialize(string name, NameValueCollection config)
{
    base.Initialize(name, config);

    var configuredExtensions = config["extensions"];
    if (!string.IsNullOrEmpty(configuredExtensions))
    {
        // RemoveEmptyEntries does not strip whitespace around entries, and an entry such as
        // " .doc" could never equal a file extension, so trim explicitly.
        var parsed = configuredExtensions
            .Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
            .Select(x => x.Trim())
            .Where(x => x.Length > 0)
            .ToArray();

        if (parsed.Length > 0)
        {
            SupportedExtensions = parsed;
        }
    }

    //checks if a custom field alias is specified
    var configuredFileProperty = config["umbracoFileProperty"];
    if (!string.IsNullOrEmpty(configuredFileProperty))
    {
        UmbracoFileProperty = configuredFileProperty;
    }
}
|
||||
|
||||
/// <summary>
/// Extracts the text to be indexed from the specified file.
/// </summary>
/// <remarks>
/// The file extension is compared against <see cref="SupportedExtensions"/> using an ordinal,
/// case-insensitive comparison, so the result does not depend on the current culture.
/// A null <see cref="SupportedExtensions"/> is treated as "nothing is supported".
/// Errors reported by the PDF parser while reading page content are raised through
/// <c>OnIndexingError</c> with a node id of -1.
/// </remarks>
/// <param name="file">The file to read; its extension must be one of the supported extensions.</param>
/// <returns>The text extracted from the file.</returns>
/// <exception cref="NotSupportedException">The file's extension is not in <see cref="SupportedExtensions"/>.</exception>
protected virtual string ExtractTextFromFile(FileInfo file)
{
    var supported = SupportedExtensions ?? Enumerable.Empty<string>();
    if (!supported.Contains(file.Extension, StringComparer.OrdinalIgnoreCase))
    {
        throw new NotSupportedException("The file with the extension specified is not supported");
    }

    var pdf = new PDFParser();

    Action<Exception> onError = (e) => OnIndexingError(new IndexingErrorEventArgs("Could not read PDF", -1, e));

    return pdf.ParsePdfText(file.FullName, onError);
}
|
||||
|
||||
/// <summary>
/// Collects all of the data that needs to be indexed as defined in the index set, and adds the
/// text extracted from the media item's file under <see cref="TextContentFieldName"/>.
/// </summary>
/// <remarks>
/// The file property is looked up among the direct child elements of <paramref name="node"/>.
/// When no such element exists, or its value is empty, only the base fields are returned.
/// A missing file is written to the info log and an unsupported extension to the error log;
/// in both cases the base fields are still returned.
/// </remarks>
/// <param name="node">Media item XML being indexed</param>
/// <param name="type">Type of index (should only ever be media)</param>
/// <returns>Fields containing the data for the index</returns>
protected override Dictionary<string, string> GetDataToIndex(XElement node, string type)
{
    var fields = base.GetDataToIndex(node, type);

    // Find the element holding the file path: a child with an "alias" attribute is matched on
    // that attribute's value, any other child is matched on its element name.
    var fileElement = node.Elements().FirstOrDefault(x =>
    {
        var alias = x.Attribute("alias");
        return alias != null
            ? (string)alias == this.UmbracoFileProperty
            : x.Name == this.UmbracoFileProperty;
    });

    // The explicit XElement-to-string conversion yields null for a null element.
    var storedPath = (string)fileElement;
    if (string.IsNullOrEmpty(storedPath))
    {
        return fields;
    }

    //get the file path from the data service
    var fullPath = this.DataService.MapPath(storedPath);
    var fi = new FileInfo(fullPath);

    //make sure the file exists
    if (!fi.Exists)
    {
        // Log the path that was actually checked; concatenating the XElement itself would
        // write the element's XML markup into the log instead of a path.
        DataService.LogService.AddInfoLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: No file found at path " + fullPath);
        return fields;
    }

    try
    {
        fields.Add(TextContentFieldName, ExtractTextFromFile(fi));
    }
    catch (NotSupportedException)
    {
        //log that we couldn't index the file found
        DataService.LogService.AddErrorLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: Extension '" + fi.Extension + "' is not supported at this time");
    }

    return fields;
}
|
||||
|
||||
#region Internal PDFParser Class
|
||||
|
||||
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
internal class PDFParser
{
    /// <summary>
    /// The character codes that are stripped from extracted text:
    /// 0x0000 to 0x001F inclusive (32 values).
    /// </summary>
    /// <remarks>
    /// Held in a set because it is consulted once per extracted character.
    /// used as a reference:
    /// http://www.tamasoft.co.jp/en/general-info/unicode.html
    /// http://en.wikipedia.org/wiki/Summary_of_Unicode_character_assignments
    /// http://en.wikipedia.org/wiki/Unicode
    /// http://en.wikipedia.org/wiki/Basic_Multilingual_Plane
    /// </remarks>
    private static readonly HashSet<int> m_UnsupportedRange =
        new HashSet<int>(Enumerable.Range(0x0000, 0x0020));

    /// <summary>
    /// Return only the valid string contents of the PDF
    /// </summary>
    /// <remarks>
    /// Every page's content stream is tokenised and the characters of each string token are
    /// appended to the result, skipping the unsupported character codes. The reader is always
    /// closed before returning, including when an exception escapes.
    /// </remarks>
    /// <param name="sourcePDF">The PDF location handed to the <c>PdfReader</c> constructor.</param>
    /// <param name="onError">
    /// Invoked with the exception when tokenising a page raises an <c>InvalidPdfException</c>;
    /// processing then continues with the next page, keeping the text collected so far.
    /// </param>
    /// <returns>The concatenated text of all string tokens found in the document.</returns>
    [SecuritySafeCritical]
    public string ParsePdfText(string sourcePDF, Action<Exception> onError)
    {
        var sb = new StringBuilder();

        var reader = new PdfReader(sourcePDF);
        try
        {
            for (var i = 1; i <= reader.NumberOfPages; i++)
            {
                var pageBytes = reader.GetPageContent(i);
                if (pageBytes == null)
                {
                    continue;
                }

                var token = new PRTokeniser(pageBytes);
                try
                {
                    while (token.NextToken())
                    {
                        if (token.TokenType != PRTokeniser.TokType.STRING)
                        {
                            continue;
                        }

                        foreach (var s in token.StringValue)
                        {
                            //strip out unsupported characters, based on unicode tables.
                            if (!m_UnsupportedRange.Contains(s))
                            {
                                sb.Append(s);
                            }
                        }
                    }
                }
                catch (InvalidPdfException ex)
                {
                    onError(ex);
                }
            }
        }
        finally
        {
            // Release the reader so the source file is not left open.
            reader.Close();
        }

        return sb.ToString();
    }
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
31
src/UmbracoExamine.PDF/Properties/AssemblyInfo.cs
Normal file
31
src/UmbracoExamine.PDF/Properties/AssemblyInfo.cs
Normal file
@@ -0,0 +1,31 @@
|
||||
using System.Reflection;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Security;
|
||||
|
||||
// General Information about an assembly is controlled through the following
|
||||
// set of attributes. Change these attribute values to modify the information
|
||||
// associated with an assembly.
|
||||
[assembly: AssemblyCompany("umbraco")]
|
||||
[assembly: AssemblyCopyright("Copyright © Umbraco 2012")]
|
||||
[assembly: AssemblyTrademark("")]
|
||||
[assembly: AssemblyCulture("")]
|
||||
[assembly: AssemblyTitle("UmbracoExamine.PDF")]
|
||||
[assembly: AssemblyDescription("Umbraco index providers for PDF based on the Examine model using Lucene.NET 2.9.2")]
|
||||
[assembly: AssemblyConfiguration("")]
|
||||
[assembly: AssemblyProduct("UmbracoExamine.PDF")]
|
||||
|
||||
// Setting ComVisible to false makes the types in this assembly not visible
|
||||
// to COM components. If you need to access a type in this assembly from
|
||||
// COM, set the ComVisible attribute to true on that type.
|
||||
[assembly: ComVisible(false)]
|
||||
|
||||
// The following GUID is for the ID of the typelib if this project is exposed to COM
|
||||
[assembly: Guid("8933a78d-8414-4c72-a74d-76aa7fb0e9ad")]
|
||||
|
||||
//NOTE: We cannot change the major version to be the same as Umbraco because of backwards compatibility, however we
|
||||
// will make the minor version the same as the umbraco version
|
||||
[assembly: AssemblyVersion("0.6.0.*")]
|
||||
[assembly: AssemblyFileVersion("0.6.0.*")]
|
||||
|
||||
[assembly: AllowPartiallyTrustedCallers]
|
||||
89
src/UmbracoExamine.PDF/UmbracoExamine.PDF.csproj
Normal file
89
src/UmbracoExamine.PDF/UmbracoExamine.PDF.csproj
Normal file
@@ -0,0 +1,89 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<PropertyGroup>
|
||||
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
||||
<ProductVersion>8.0.30703</ProductVersion>
|
||||
<SchemaVersion>2.0</SchemaVersion>
|
||||
<ProjectGuid>{F30DDDB8-3994-4673-82AE-057123C6E1A8}</ProjectGuid>
|
||||
<OutputType>Library</OutputType>
|
||||
<AppDesignerFolder>Properties</AppDesignerFolder>
|
||||
<RootNamespace>UmbracoExamine.PDF</RootNamespace>
|
||||
<AssemblyName>UmbracoExamine.PDF</AssemblyName>
|
||||
<TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
|
||||
<FileAlignment>512</FileAlignment>
|
||||
<SccProjectName>
|
||||
</SccProjectName>
|
||||
<SccLocalPath>
|
||||
</SccLocalPath>
|
||||
<SccAuxPath>
|
||||
</SccAuxPath>
|
||||
<SccProvider>
|
||||
</SccProvider>
|
||||
<TargetFrameworkProfile />
|
||||
<SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\</SolutionDir>
|
||||
<RestorePackages>true</RestorePackages>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
|
||||
<DebugSymbols>true</DebugSymbols>
|
||||
<DebugType>full</DebugType>
|
||||
<Optimize>false</Optimize>
|
||||
<OutputPath>bin\Debug\</OutputPath>
|
||||
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||
<ErrorReport>prompt</ErrorReport>
|
||||
<WarningLevel>4</WarningLevel>
|
||||
<CodeAnalysisRuleSet>SecurityRules.ruleset</CodeAnalysisRuleSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
|
||||
<DebugType>pdbonly</DebugType>
|
||||
<Optimize>true</Optimize>
|
||||
<OutputPath>bin\Release\</OutputPath>
|
||||
<DefineConstants>TRACE</DefineConstants>
|
||||
<ErrorReport>prompt</ErrorReport>
|
||||
<WarningLevel>4</WarningLevel>
|
||||
<DocumentationFile>bin\Release\UmbracoExamine.PDF.XML</DocumentationFile>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Reference Include="Examine">
|
||||
<HintPath>..\packages\Examine.0.1.42.2941\lib\Examine.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="ICSharpCode.SharpZipLib">
|
||||
<HintPath>..\packages\SharpZipLib.0.86.0\lib\20\ICSharpCode.SharpZipLib.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="itextsharp">
|
||||
<HintPath>..\packages\iTextSharp.5.3.3\lib\itextsharp.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="Lucene.Net">
|
||||
<HintPath>..\packages\Lucene.Net.2.9.4.1\lib\net40\Lucene.Net.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="System" />
|
||||
<Reference Include="System.configuration" />
|
||||
<Reference Include="System.Core" />
|
||||
<Reference Include="System.Xml.Linq" />
|
||||
<Reference Include="System.Data.DataSetExtensions" />
|
||||
<Reference Include="System.Data" />
|
||||
<Reference Include="System.Xml" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Compile Include="PDFIndexer.cs" />
|
||||
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\UmbracoExamine\UmbracoExamine.csproj">
|
||||
<Project>{07fbc26b-2927-4a22-8d96-d644c667fecc}</Project>
|
||||
<Name>UmbracoExamine</Name>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||
<Import Project="$(SolutionDir)\.nuget\nuget.targets" />
|
||||
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
|
||||
Other similar extension points exist, see Microsoft.Common.targets.
|
||||
<Target Name="BeforeBuild">
|
||||
</Target>
|
||||
<Target Name="AfterBuild">
|
||||
</Target>
|
||||
-->
|
||||
</Project>
|
||||
7
src/UmbracoExamine.PDF/packages.config
Normal file
7
src/UmbracoExamine.PDF/packages.config
Normal file
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="Examine" version="0.1.42.2941" targetFramework="net40" />
|
||||
<package id="iTextSharp" version="5.3.3" targetFramework="net40" />
|
||||
<package id="Lucene.Net" version="2.9.4.1" targetFramework="net40" />
|
||||
<package id="SharpZipLib" version="0.86.0" targetFramework="net40" />
|
||||
</packages>
|
||||
Reference in New Issue
Block a user