Fixes U4-2618 PDF indexer does not ensure spaces are added to textual content and fixes med trust compat relating to U4-2180 Problem with Examine MultiIndexSearcher after upgrading from 4.11.3 to 4.11.8

This commit is contained in:
Shannon
2013-08-13 13:47:57 +10:00
parent c52c452b36
commit aa1c411c8c
5 changed files with 76 additions and 49 deletions

View File

@@ -28,6 +28,16 @@ namespace Umbraco.Core
[UmbracoWillObsolete("Do not use this constants. See IShortStringHelper.CleanStringForSafeAliasJavaScriptCode.")]
public const string UmbracoInvalidFirstCharacters = "01234567890";
public static string ExceptChars(this string str, HashSet<char> toExclude)
{
var sb = new StringBuilder(str.Length);
foreach (var c in str.Where(c => toExclude.Contains(c) == false))
{
sb.Append(c);
}
return sb.ToString();
}
/// <summary>
/// Encrypt the string using the MachineKey in medium trust
/// </summary>

View File

@@ -12,6 +12,7 @@ using iTextSharp.text.pdf;
using System.Text;
using Lucene.Net.Analysis;
using UmbracoExamine.DataServices;
using iTextSharp.text.pdf.parser;
namespace UmbracoExamine.PDF
@@ -105,6 +106,7 @@ namespace UmbracoExamine.PDF
/// </summary>
/// <param name="name"></param>
/// <param name="config"></param>
[SecuritySafeCritical]
public override void Initialize(string name, NameValueCollection config)
{
base.Initialize(name, config);
@@ -133,7 +135,7 @@ namespace UmbracoExamine.PDF
Action<Exception> onError = (e) => OnIndexingError(new IndexingErrorEventArgs("Could not read PDF", -1, e));
var txt = pdf.ParsePdfText(file.FullName, onError);
var txt = pdf.GetTextFromAllPages(file.FullName, onError);
return txt;
}
@@ -193,16 +195,21 @@ namespace UmbracoExamine.PDF
static PDFParser()
{
lock (m_Locker)
lock (Locker)
{
m_UnsupportedRange = new List<int>();
m_UnsupportedRange.AddRange(Enumerable.Range(0x0000, 0x001F));
m_UnsupportedRange.Add(0x1F);
UnsupportedRange = new HashSet<char>();
foreach (var c in Enumerable.Range(0x0000, 0x001F))
{
UnsupportedRange.Add((char) c);
}
UnsupportedRange.Add((char)0x1F);
//replace line breaks with space
ReplaceWithSpace = new HashSet<char> {'\r', '\n'};
}
}
private static readonly object m_Locker = new object();
private static readonly object Locker = new object();
/// <summary>
/// Stores the unsupported range of character
@@ -214,61 +221,68 @@ namespace UmbracoExamine.PDF
/// http://en.wikipedia.org/wiki/Unicode
/// http://en.wikipedia.org/wiki/Basic_Multilingual_Plane
/// </remarks>
private static List<int> m_UnsupportedRange;
private static HashSet<char> UnsupportedRange;
/// <summary>
/// Return only the valid string contents of the PDF
/// </summary>
/// <param name="sourcePDF"></param>
/// <param name="onError"></param>
/// <returns></returns>
[SecuritySafeCritical]
public string ParsePdfText(string sourcePDF, Action<Exception> onError)
private static HashSet<char> ReplaceWithSpace;
[SecuritySafeCritical]
public string GetTextFromAllPages(string pdfPath, Action<Exception> onError)
{
var sb = new StringBuilder();
var output = new StringWriter();
var reader = new PdfReader(sourcePDF);
PRTokeniser token = null;
var tknValue = String.Empty;
for (var i = 1; (i <= reader.NumberOfPages); i++)
try
{
var pageBytes = reader.GetPageContent(i);
if (pageBytes != null)
{
token = new PRTokeniser(pageBytes);
try
{
while (token.NextToken())
{
var tknType = token.TokenType;
tknValue = token.StringValue;
if ((tknType == PRTokeniser.TokType.STRING))
{
foreach (var s in tknValue)
{
//strip out unsupported characters, based on unicode tables.
if (!m_UnsupportedRange.Contains(s))
{
sb.Append(s);
}
}
var reader = new PdfReader(pdfPath);
}
}
}
catch (InvalidPdfException ex)
{
onError(ex);
}
for (int i = 1; i <= reader.NumberOfPages; i++)
{
var result =
ExceptChars(
PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()),
UnsupportedRange,
ReplaceWithSpace);
output.Write(result);
}
}
catch (Exception ex)
{
onError(ex);
}
return sb.ToString();
return output.ToString();
}
}
/// <summary>
/// remove all toExclude chars from string
/// </summary>
/// <param name="str"></param>
/// <param name="toExclude"></param>
/// <param name="replaceWithSpace"></param>
/// <returns></returns>
private static string ExceptChars(string str, HashSet<char> toExclude, HashSet<char> replaceWithSpace)
{
var sb = new StringBuilder(str.Length);
for (var i = 0; i < str.Length; i++)
{
var c = str[i];
if (toExclude.Contains(c) == false)
{
if (replaceWithSpace.Contains(c))
{
sb.Append(" ");
}
else
{
sb.Append(c);
}
}
}
return sb.ToString();
}
#endregion
}

View File

@@ -96,6 +96,7 @@ namespace UmbracoExamine
/// </summary>
/// <param name="name"></param>
/// <param name="config"></param>
[SecuritySafeCritical]
public override void Initialize(string name, System.Collections.Specialized.NameValueCollection config)
{
if (config["dataService"] != null && !string.IsNullOrEmpty(config["dataService"]))

View File

@@ -124,6 +124,7 @@ namespace UmbracoExamine
/// <exception cref="T:System.InvalidOperationException">
/// An attempt is made to call <see cref="M:System.Configuration.Provider.ProviderBase.Initialize(System.String,System.Collections.Specialized.NameValueCollection)"/> on a provider after the provider has already been initialized.
/// </exception>
[SecuritySafeCritical]
public override void Initialize(string name, System.Collections.Specialized.NameValueCollection config)
{

View File

@@ -45,6 +45,7 @@ namespace UmbracoExamine
}
}
[SecuritySafeCritical]
public override void Initialize(string name, System.Collections.Specialized.NameValueCollection config)
{
if (name == null) throw new ArgumentNullException("name");