Added UmbracoExamine.PDF library and unit tests.

This commit is contained in:
Shannon Deminick
2013-01-05 04:43:15 +03:00
parent d1bf2c0ad7
commit 6513097d38
11 changed files with 523 additions and 10 deletions

View File

@@ -271,6 +271,7 @@
<Compile Include="UmbracoExamine\EventsTest.cs" />
<Compile Include="UmbracoExamine\IndexInitializer.cs" />
<Compile Include="UmbracoExamine\IndexTest.cs" />
<Compile Include="UmbracoExamine\PdfIndexerTests.cs" />
<Compile Include="UmbracoExamine\TestDataService.cs" />
<Compile Include="UmbracoExamine\TestFiles.Designer.cs">
<AutoGen>True</AutoGen>
@@ -358,6 +359,10 @@
<Project>{651E1350-91B6-44B7-BD60-7207006D7003}</Project>
<Name>Umbraco.Web</Name>
</ProjectReference>
<ProjectReference Include="..\UmbracoExamine.PDF\UmbracoExamine.PDF.csproj">
<Project>{f30dddb8-3994-4673-82ae-057123c6e1a8}</Project>
<Name>UmbracoExamine.PDF</Name>
</ProjectReference>
<ProjectReference Include="..\UmbracoExamine\UmbracoExamine.csproj">
<Project>{07fbc26b-2927-4a22-8d96-d644c667fecc}</Project>
<Name>UmbracoExamine</Name>

View File

@@ -3,6 +3,7 @@ using Examine;
using Examine.LuceneEngine.Providers;
using Lucene.Net.Analysis.Standard;
using UmbracoExamine;
using UmbracoExamine.PDF;
namespace Umbraco.Tests.UmbracoExamine
{
@@ -100,17 +101,17 @@ namespace Umbraco.Tests.UmbracoExamine
{
return new LuceneSearcher(luceneDir, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
}
//public static PDFIndexer GetPdfIndexer(Lucene.Net.Store.Directory luceneDir)
//{
// var i = new PDFIndexer(luceneDir,
// new TestDataService(),
// new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29),
// false);
/// <summary>
/// Creates a <see cref="PDFIndexer"/> over the supplied Lucene directory for use in tests.
/// </summary>
/// <param name="luceneDir">The Lucene directory handed to the indexer's constructor.</param>
/// <returns>The new indexer, with its <c>IndexingError</c> event wired to this class's <c>IndexingError</c> handler.</returns>
public static PDFIndexer GetPdfIndexer(Lucene.Net.Store.Directory luceneDir)
{
// Built with a TestDataService and a StandardAnalyzer; the final argument is the indexer's "async" flag (false here).
var i = new PDFIndexer(luceneDir,
new TestDataService(),
new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29),
false);
// i.IndexingError += IndexingError;
// Subscribe the handler named IndexingError (defined outside this method) to the indexer's error event.
i.IndexingError += IndexingError;
// return i;
//}
return i;
}
public static MultiIndexSearcher GetMultiSearcher(Lucene.Net.Store.Directory pdfDir, Lucene.Net.Store.Directory simpleDir, Lucene.Net.Store.Directory conventionDir, Lucene.Net.Store.Directory cwsDir)
{
var i = new MultiIndexSearcher(new[] { pdfDir, simpleDir, conventionDir, cwsDir }, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));

View File

@@ -0,0 +1,93 @@
using System.Diagnostics;
using System.Linq;
using System.Xml.Linq;
using Examine;
using Lucene.Net.Search;
using Lucene.Net.Store;
using NUnit.Framework;
using UmbracoExamine;
using UmbracoExamine.PDF;
namespace Umbraco.Tests.UmbracoExamine
{
/// <summary>
/// Tests for <see cref="PDFIndexer"/>. Each test gets a freshly rebuilt index in a new
/// <see cref="RAMDirectory"/> plus a searcher over that same directory.
/// </summary>
[TestFixture]
public class PdfIndexerTests
{
    private readonly TestMediaService _mediaService = new TestMediaService();

    // These are instance fields (not static): they are re-created in every [SetUp], so static
    // storage only shared mutable state between fixture instances for no benefit.
    private PDFIndexer _indexer;
    private UmbracoExamineSearcher _searcher;
    private Lucene.Net.Store.Directory _luceneDir;

    /// <summary>
    /// Creates a new directory, builds the PDF index into it and opens a searcher on it.
    /// </summary>
    [SetUp]
    public void Initialize()
    {
        _luceneDir = new RAMDirectory();
        _indexer = IndexInitializer.GetPdfIndexer(_luceneDir);
        _indexer.RebuildIndex();
        _searcher = IndexInitializer.GetUmbracoSearcher(_luceneDir);
    }

    /// <summary>
    /// Disposes the directory created for the test.
    /// </summary>
    [TearDown]
    public void TearDown()
    {
        _luceneDir.Dispose();
    }

    /// <summary>
    /// A PDF whose parent/path does not sit under the configured parent id must not be indexed.
    /// </summary>
    [Test]
    public void PDFIndexer_Ensure_ParentID_Honored()
    {
        //change parent id to 1116
        var existingCriteria = (IndexCriteria)_indexer.IndexerData;
        _indexer.IndexerData = new IndexCriteria(
            existingCriteria.StandardFields,
            existingCriteria.UserFields,
            existingCriteria.IncludeNodeTypes,
            existingCriteria.ExcludeNodeTypes,
            1116);

        //get the pdf node with id 2112
        var node = _mediaService.GetLatestMediaByXpath("//*[string-length(@id)>0 and number(@id)>0]")
            .Root
            .Elements()
            .First(x => (int)x.Attribute("id") == 2112);

        //create a copy of 2112 underneath 1111, which is 'not indexable'
        var newpdf = XElement.Parse(node.ToString());
        newpdf.SetAttributeValue("id", "999999");
        newpdf.SetAttributeValue("path", "-1,1111,999999");
        newpdf.SetAttributeValue("parentID", "1111");

        //now reindex
        _indexer.ReIndexNode(newpdf, IndexTypes.Media);

        //make sure it doesn't exist
        var results = _searcher.Search(_searcher.CreateSearchCriteria().Id(999999).Compile());
        Assert.AreEqual(0, results.Count());
    }

    /// <summary>
    /// After a rebuild the index holds the expected number of documents and the extracted
    /// PDF text is stored in <see cref="PDFIndexer.TextContentFieldName"/>.
    /// </summary>
    [Test]
    public void PDFIndexer_Reindex()
    {
        //get searcher and reader to get stats
        var r = ((IndexSearcher)_searcher.GetSearcher()).GetIndexReader();

        Trace.Write("Num docs = " + r.NumDocs().ToString());

        Assert.AreEqual(7, r.NumDocs());

        //search the pdf content to ensure it's there
        Assert.IsTrue(_searcher.Search(_searcher.CreateSearchCriteria().Id(1113).Compile()).Single()
            .Fields[PDFIndexer.TextContentFieldName].Contains("EncapsulateField"));
        Assert.IsTrue(_searcher.Search(_searcher.CreateSearchCriteria().Id(1114).Compile()).Single()
            .Fields[PDFIndexer.TextContentFieldName].Contains("metaphysical realism"));

        //the contour PDF cannot be read properly; this is due to the PDF encoding!
        //Assert.IsTrue(_searcher.Search(_searcher.CreateSearchCriteria().Id(1115).Compile()).Single()
        //    .Fields[PDFIndexer.TextContentFieldName].Contains("Returns All records from the form with the id"));

        Assert.IsTrue(_searcher.Search(_searcher.CreateSearchCriteria().Id(1116).Compile()).Single()
            .Fields[PDFIndexer.TextContentFieldName].Contains("What long-term preservation"));
    }
}
}

View File

@@ -0,0 +1,277 @@
using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.Linq;
using System.Security;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using Examine;
using iTextSharp.text.exceptions;
using iTextSharp.text.pdf;
using System.Text;
using Lucene.Net.Analysis;
using UmbracoExamine.DataServices;
namespace UmbracoExamine.PDF
{
/// <summary>
/// An Umbraco Lucene.Net indexer which will index the text content of a file
/// </summary>
public class PDFIndexer : BaseUmbracoIndexer
{
    #region Constructors

    /// <summary>
    /// Default constructor. Sets the supported extension list to ".pdf" and the file
    /// property alias to "umbracoFile".
    /// </summary>
    public PDFIndexer()
    {
        SupportedExtensions = new[] { ".pdf" };
        UmbracoFileProperty = "umbracoFile";
    }

    /// <summary>
    /// Constructor to allow for creating an indexer at runtime
    /// </summary>
    /// <param name="indexPath">Passed through to the base indexer constructor.</param>
    /// <param name="dataService">Passed through to the base indexer constructor.</param>
    /// <param name="analyzer">Passed through to the base indexer constructor.</param>
    /// <param name="async">Passed through to the base indexer constructor.</param>
    [SecuritySafeCritical]
    public PDFIndexer(DirectoryInfo indexPath, IDataService dataService, Analyzer analyzer, bool async)
        : base(
            new IndexCriteria(Enumerable.Empty<IIndexField>(), Enumerable.Empty<IIndexField>(), Enumerable.Empty<string>(), Enumerable.Empty<string>(), null),
            indexPath, dataService, analyzer, async)
    {
        SupportedExtensions = new[] { ".pdf" };
        UmbracoFileProperty = "umbracoFile";
    }

    /// <summary>
    /// Constructor to allow for creating an indexer at runtime
    /// </summary>
    /// <param name="luceneDirectory">Passed through to the base indexer constructor.</param>
    /// <param name="dataService">Passed through to the base indexer constructor.</param>
    /// <param name="analyzer">Passed through to the base indexer constructor.</param>
    /// <param name="async">Passed through to the base indexer constructor.</param>
    [SecuritySafeCritical]
    public PDFIndexer(Lucene.Net.Store.Directory luceneDirectory, IDataService dataService, Analyzer analyzer, bool async)
        : base(
            new IndexCriteria(Enumerable.Empty<IIndexField>(), Enumerable.Empty<IIndexField>(), Enumerable.Empty<string>(), Enumerable.Empty<string>(), null),
            luceneDirectory, dataService, analyzer, async)
    {
        SupportedExtensions = new[] { ".pdf" };
        UmbracoFileProperty = "umbracoFile";
    }

    #endregion

    #region Properties

    /// <summary>
    /// Gets or sets the supported extensions for files, currently the system will only
    /// process PDF files.
    /// </summary>
    /// <value>The supported extensions.</value>
    public IEnumerable<string> SupportedExtensions { get; set; }

    /// <summary>
    /// Gets or sets the umbraco property alias (defaults to umbracoFile)
    /// </summary>
    /// <value>The umbraco file property.</value>
    public string UmbracoFileProperty { get; set; }

    /// <summary>
    /// Gets the name of the Lucene.Net field which the content is inserted into
    /// </summary>
    /// <value>The name of the text content field.</value>
    public const string TextContentFieldName = "FileTextContent";

    /// <summary>
    /// The index types this indexer handles: media only.
    /// </summary>
    protected override IEnumerable<string> SupportedTypes
    {
        get
        {
            return new string[] { IndexTypes.Media };
        }
    }

    #endregion

    /// <summary>
    /// Runs the base initialization and then applies the two optional settings this indexer
    /// reads from configuration: "extensions" (comma separated list that replaces
    /// <see cref="SupportedExtensions"/>) and "umbracoFileProperty" (replaces
    /// <see cref="UmbracoFileProperty"/>).
    /// </summary>
    /// <param name="name">Passed through to the base initializer.</param>
    /// <param name="config">Configuration values; also passed through to the base initializer.</param>
    public override void Initialize(string name, NameValueCollection config)
    {
        base.Initialize(name, config);

        var extensions = config["extensions"];
        if (!string.IsNullOrEmpty(extensions))
        {
            // Trim each entry so a list written as ".pdf, .doc" does not yield " .doc",
            // which could never match a real file extension.
            SupportedExtensions = extensions
                .Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
                .Select(x => x.Trim())
                .Where(x => x.Length > 0)
                .ToArray();
        }

        //checks if a custom field alias is specified
        var fileProperty = config["umbracoFileProperty"];
        if (!string.IsNullOrEmpty(fileProperty))
        {
            UmbracoFileProperty = fileProperty;
        }
    }

    /// <summary>
    /// Provides the means to extract the text to be indexed from the file specified
    /// </summary>
    /// <param name="file">The file to read.</param>
    /// <returns>The text extracted from the file.</returns>
    /// <exception cref="NotSupportedException">
    /// The file's extension is not listed in <see cref="SupportedExtensions"/>.
    /// </exception>
    protected virtual string ExtractTextFromFile(FileInfo file)
    {
        // Ordinal, case-insensitive match: extensions are identifiers, so the comparison must
        // not depend on the current culture (ToUpper() does).
        if (!SupportedExtensions.Contains(file.Extension, StringComparer.OrdinalIgnoreCase))
        {
            throw new NotSupportedException("The file with the extension specified is not supported");
        }

        var pdf = new PDFParser();

        // Parser errors are surfaced through the indexer's error event; -1 is passed as the node id.
        Action<Exception> onError = (e) => OnIndexingError(new IndexingErrorEventArgs("Could not read PDF", -1, e));

        return pdf.ParsePdfText(file.FullName, onError);
    }

    /// <summary>
    /// Collects all of the data that needs to be indexed as defined in the index set.
    /// </summary>
    /// <param name="node">Media item XML being indexed</param>
    /// <param name="type">Type of index (should only ever be media)</param>
    /// <returns>Fields containing the data for the index</returns>
    protected override Dictionary<string, string> GetDataToIndex(XElement node, string type)
    {
        var fields = base.GetDataToIndex(node, type);

        //find the element which contains the file: matched on its "alias" attribute when it has
        //one, otherwise on its element name
        var fileElement = node.Elements().FirstOrDefault(x =>
        {
            if (x.Attribute("alias") != null)
                return (string)x.Attribute("alias") == this.UmbracoFileProperty;
            else
                return x.Name == this.UmbracoFileProperty;
        });

        if (fileElement == null)
        {
            return fields;
        }

        var filePath = (string)fileElement;
        if (string.IsNullOrEmpty(filePath))
        {
            return fields;
        }

        //get the file path from the data service
        var fullPath = this.DataService.MapPath(filePath);
        var fi = new FileInfo(fullPath);
        if (fi.Exists)
        {
            try
            {
                fields.Add(TextContentFieldName, ExtractTextFromFile(fi));
            }
            catch (NotSupportedException)
            {
                //log that we couldn't index the file found
                DataService.LogService.AddErrorLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: Extension '" + fi.Extension + "' is not supported at this time");
            }
        }
        else
        {
            // Log the path value itself; concatenating the XElement would write its XML markup.
            DataService.LogService.AddInfoLog((int)node.Attribute("id"), "UmbracoExamine.FileIndexer: No file found at path " + filePath);
        }

        return fields;
    }

    #region Internal PDFParser Class

    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    internal class PDFParser
    {
        /// <summary>
        /// Stores the unsupported range of characters (code points 0x00 through 0x1F).
        /// </summary>
        /// <remarks>
        /// used as a reference:
        /// http://www.tamasoft.co.jp/en/general-info/unicode.html
        /// http://en.wikipedia.org/wiki/Summary_of_Unicode_character_assignments
        /// http://en.wikipedia.org/wiki/Unicode
        /// http://en.wikipedia.org/wiki/Basic_Multilingual_Plane
        /// </remarks>
        private static readonly HashSet<int> m_UnsupportedRange = new HashSet<int>(Enumerable.Range(0x0000, 0x0020));

        /// <summary>
        /// Return only the valid string contents of the PDF
        /// </summary>
        /// <param name="sourcePDF">Path of the PDF file to read.</param>
        /// <param name="onError">
        /// Invoked with the exception when tokenising a page throws an
        /// <see cref="InvalidPdfException"/>; parsing then continues with the next page.
        /// </param>
        /// <returns>
        /// The characters of every string token on every page, in order, with the
        /// unsupported characters removed.
        /// </returns>
        [SecuritySafeCritical]
        public string ParsePdfText(string sourcePDF, Action<Exception> onError)
        {
            var sb = new StringBuilder();

            var reader = new PdfReader(sourcePDF);
            try
            {
                for (var i = 1; i <= reader.NumberOfPages; i++)
                {
                    var pageBytes = reader.GetPageContent(i);
                    if (pageBytes == null)
                    {
                        continue;
                    }

                    var token = new PRTokeniser(pageBytes);
                    try
                    {
                        while (token.NextToken())
                        {
                            if (token.TokenType != PRTokeniser.TokType.STRING)
                            {
                                continue;
                            }

                            foreach (var s in token.StringValue)
                            {
                                //strip out unsupported characters, based on unicode tables.
                                if (!m_UnsupportedRange.Contains(s))
                                {
                                    sb.Append(s);
                                }
                            }
                        }
                    }
                    catch (InvalidPdfException ex)
                    {
                        onError(ex);
                    }
                }
            }
            finally
            {
                // Always release the reader so the source file is not left open.
                reader.Close();
            }

            return sb.ToString();
        }
    }

    #endregion
}
}

View File

@@ -0,0 +1,31 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Security;
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyCompany("umbraco")]
[assembly: AssemblyCopyright("Copyright © Umbraco 2012")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
[assembly: AssemblyTitle("UmbracoExamine.PDF")]
[assembly: AssemblyDescription("Umbraco index providers for PDF based on the Examine model using Lucene.NET 2.9.2")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyProduct("UmbracoExamine.PDF")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("8933a78d-8414-4c72-a74d-76aa7fb0e9ad")]
// NOTE: We cannot change the major version to be the same as Umbraco's because of backwards compatibility; however, we
// will make the minor version the same as the umbraco version
[assembly: AssemblyVersion("0.6.0.*")]
// AssemblyFileVersion is intentionally not specified: it does not support the '*' wildcard
// (only AssemblyVersion does). When it is omitted, the file version defaults to the
// assembly version, including its generated build and revision numbers.
[assembly: AllowPartiallyTrustedCallers]

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>8.0.30703</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{F30DDDB8-3994-4673-82AE-057123C6E1A8}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>UmbracoExamine.PDF</RootNamespace>
<AssemblyName>UmbracoExamine.PDF</AssemblyName>
<TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<SccProjectName>
</SccProjectName>
<SccLocalPath>
</SccLocalPath>
<SccAuxPath>
</SccAuxPath>
<SccProvider>
</SccProvider>
<TargetFrameworkProfile />
<SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\</SolutionDir>
<RestorePackages>true</RestorePackages>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<CodeAnalysisRuleSet>SecurityRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<DocumentationFile>bin\Release\UmbracoExamine.PDF.XML</DocumentationFile>
</PropertyGroup>
<ItemGroup>
<Reference Include="Examine">
<HintPath>..\packages\Examine.0.1.42.2941\lib\Examine.dll</HintPath>
</Reference>
<Reference Include="ICSharpCode.SharpZipLib">
<HintPath>..\packages\SharpZipLib.0.86.0\lib\20\ICSharpCode.SharpZipLib.dll</HintPath>
</Reference>
<Reference Include="itextsharp">
<HintPath>..\packages\iTextSharp.5.3.3\lib\itextsharp.dll</HintPath>
</Reference>
<Reference Include="Lucene.Net">
<HintPath>..\packages\Lucene.Net.2.9.4.1\lib\net40\Lucene.Net.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.configuration" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="System.Data" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="PDFIndexer.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\UmbracoExamine\UmbracoExamine.csproj">
<Project>{07fbc26b-2927-4a22-8d96-d644c667fecc}</Project>
<Name>UmbracoExamine</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Import Project="$(SolutionDir)\.nuget\nuget.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

View File

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Examine" version="0.1.42.2941" targetFramework="net40" />
<package id="iTextSharp" version="5.3.3" targetFramework="net40" />
<package id="Lucene.Net" version="2.9.4.1" targetFramework="net40" />
<package id="SharpZipLib" version="0.86.0" targetFramework="net40" />
</packages>

View File

@@ -80,7 +80,8 @@
<AssemblyOriginatorKeyFile>..\Solution Items\TheFARM-Public.snk</AssemblyOriginatorKeyFile>
</PropertyGroup>
<ItemGroup>
<Reference Include="Examine">
<Reference Include="Examine, Version=0.1.43.2941, Culture=neutral, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\packages\Examine.0.1.42.2941\lib\Examine.dll</HintPath>
</Reference>
<Reference Include="ICSharpCode.SharpZipLib, Version=0.86.0.518, Culture=neutral, PublicKeyToken=1b03e6acf1164f73, processorArchitecture=MSIL">

View File

@@ -1,5 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Examine" version="0.1.42.2941" targetFramework="net40" />
<package id="Lucene.Net" version="2.9.4.1" targetFramework="net40" />
<package id="SharpZipLib" version="0.86.0" targetFramework="net40" />
</packages>

View File

@@ -12,5 +12,6 @@
<repository path="..\Umbraco.Tests\packages.config" />
<repository path="..\Umbraco.Web.UI\packages.config" />
<repository path="..\Umbraco.Web\packages.config" />
<repository path="..\UmbracoExamine.PDF\packages.config" />
<repository path="..\UmbracoExamine\packages.config" />
</repositories>

View File

@@ -63,6 +63,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "UmbracoExamine", "UmbracoEx
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "UmbracoExamineLibs", "UmbracoExamineLibs", "{DD32977B-EF54-475B-9A1B-B97A502C6E58}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "UmbracoExamine.PDF", "UmbracoExamine.PDF\UmbracoExamine.PDF.csproj", "{F30DDDB8-3994-4673-82AE-057123C6E1A8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -133,6 +135,10 @@ Global
{07FBC26B-2927-4A22-8D96-D644C667FECC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{07FBC26B-2927-4A22-8D96-D644C667FECC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{07FBC26B-2927-4A22-8D96-D644C667FECC}.Release|Any CPU.Build.0 = Release|Any CPU
{F30DDDB8-3994-4673-82AE-057123C6E1A8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F30DDDB8-3994-4673-82AE-057123C6E1A8}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F30DDDB8-3994-4673-82AE-057123C6E1A8}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F30DDDB8-3994-4673-82AE-057123C6E1A8}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -140,5 +146,6 @@ Global
GlobalSection(NestedProjects) = preSolution
{5D3B8245-ADA6-453F-A008-50ED04BFE770} = {B5BD12C1-A454-435E-8A46-FF4A364C0382}
{07FBC26B-2927-4A22-8D96-D644C667FECC} = {DD32977B-EF54-475B-9A1B-B97A502C6E58}
{F30DDDB8-3994-4673-82AE-057123C6E1A8} = {DD32977B-EF54-475B-9A1B-B97A502C6E58}
EndGlobalSection
EndGlobal