diff --git a/src/Umbraco.Core/StringExtensions.cs b/src/Umbraco.Core/StringExtensions.cs
index 33a5730444..df7c0d07b0 100644
--- a/src/Umbraco.Core/StringExtensions.cs
+++ b/src/Umbraco.Core/StringExtensions.cs
@@ -28,6 +28,22 @@ namespace Umbraco.Core
         [UmbracoWillObsolete("Do not use this constants. See IShortStringHelper.CleanStringForSafeAliasJavaScriptCode.")]
         public const string UmbracoInvalidFirstCharacters = "01234567890";
 
+        /// <summary>
+        /// Returns a copy of the string with every character contained in <paramref name="toExclude"/> removed.
+        /// </summary>
+        /// <param name="str">The string to filter.</param>
+        /// <param name="toExclude">The characters to drop from the result.</param>
+        /// <returns>The remaining characters, in their original order.</returns>
+        public static string ExceptChars(this string str, HashSet<char> toExclude)
+        {
+            var sb = new StringBuilder(str.Length);
+            foreach (var c in str.Where(c => toExclude.Contains(c) == false))
+            {
+                sb.Append(c);
+            }
+            return sb.ToString();
+        }
+
         ///
         /// Encrypt the string using the MachineKey in medium trust
         ///
diff --git a/src/UmbracoExamine.PDF/PDFIndexer.cs b/src/UmbracoExamine.PDF/PDFIndexer.cs
index 8f8237cbbb..475a75c3ce 100644
--- a/src/UmbracoExamine.PDF/PDFIndexer.cs
+++ b/src/UmbracoExamine.PDF/PDFIndexer.cs
@@ -12,6 +12,7 @@ using iTextSharp.text.pdf;
 using System.Text;
 using Lucene.Net.Analysis;
 using UmbracoExamine.DataServices;
+using iTextSharp.text.pdf.parser;
 
 
 namespace UmbracoExamine.PDF
@@ -105,6 +106,7 @@ namespace UmbracoExamine.PDF
         ///
         ///
         ///
+        [SecuritySafeCritical]
         public override void Initialize(string name, NameValueCollection config)
         {
             base.Initialize(name, config);
@@ -133,7 +135,7 @@ namespace UmbracoExamine.PDF
 
             Action<Exception> onError = (e) => OnIndexingError(new IndexingErrorEventArgs("Could not read PDF", -1, e));
 
-            var txt = pdf.ParsePdfText(file.FullName, onError);
+            var txt = pdf.GetTextFromAllPages(file.FullName, onError);
             return txt;
 
         }
@@ -193,16 +195,21 @@ namespace UmbracoExamine.PDF
 
             static PDFParser()
             {
-                lock (m_Locker)
+                lock (Locker)
                 {
-                    m_UnsupportedRange = new List<int>();
-                    m_UnsupportedRange.AddRange(Enumerable.Range(0x0000, 0x001F));
-                    m_UnsupportedRange.Add(0x1F);
+                    UnsupportedRange = new HashSet<char>();
+                    foreach (var c in Enumerable.Range(0x0000, 0x001F))
+                    {
+                        UnsupportedRange.Add((char) c);
+                    }
+                    UnsupportedRange.Add((char)0x1F);
+                    //replace line breaks with space
+                    ReplaceWithSpace = new HashSet<char> {'\r', '\n'};
                 }
             }
 
 
-            private static readonly object m_Locker = new object();
+            private static readonly object Locker = new object();
 
             ///
             /// Stores the unsupported range of character
@@ -214,61 +221,82 @@ namespace UmbracoExamine.PDF
             /// http://en.wikipedia.org/wiki/Unicode
             /// http://en.wikipedia.org/wiki/Basic_Multilingual_Plane
             ///
-            private static List<int> m_UnsupportedRange;
+            private static HashSet<char> UnsupportedRange;
 
-            ///
-            /// Return only the valid string contents of the PDF
-            ///
-            ///
-            ///
-            ///
-            [SecuritySafeCritical]
-            public string ParsePdfText(string sourcePDF, Action<Exception> onError)
+            private static HashSet<char> ReplaceWithSpace;
+
+            /// <summary>
+            /// Extracts the text of every page of the PDF, turning line breaks into spaces and dropping other control characters.
+            /// </summary>
+            /// <param name="pdfPath">Path of the PDF file to read.</param>
+            /// <param name="onError">Callback invoked with any exception raised while reading; text extracted before the failure is still returned.</param>
+            /// <returns>The concatenated text of all pages that could be read.</returns>
+            [SecuritySafeCritical]
+            public string GetTextFromAllPages(string pdfPath, Action<Exception> onError)
             {
-                var sb = new StringBuilder();
+                var output = new StringWriter();
+                PdfReader reader = null;
 
-                var reader = new PdfReader(sourcePDF);
-                PRTokeniser token = null;
-                var tknValue = String.Empty;
-
-                for (var i = 1; (i <= reader.NumberOfPages); i++)
+                try
                 {
-                    var pageBytes = reader.GetPageContent(i);
-                    if (pageBytes != null)
-                    {
-                        token = new PRTokeniser(pageBytes);
-                        try
-                        {
-                            while (token.NextToken())
-                            {
-                                var tknType = token.TokenType;
-                                tknValue = token.StringValue;
-                                if ((tknType == PRTokeniser.TokType.STRING))
-                                {
-                                    foreach (var s in tknValue)
-                                    {
-                                        //strip out unsupported characters, based on unicode tables.
-                                        if (!m_UnsupportedRange.Contains(s))
-                                        {
-                                            sb.Append(s);
-                                        }
-                                    }
+                    reader = new PdfReader(pdfPath);
 
-                                }
-                            }
-                        }
-                        catch (InvalidPdfException ex)
-                        {
-                            onError(ex);
-                        }
+                    for (int i = 1; i <= reader.NumberOfPages; i++)
+                    {
+                        var result =
+                            ExceptChars(
+                                PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()),
+                                UnsupportedRange,
+                                ReplaceWithSpace);
+                        output.Write(result);
                     }
                 }
+                catch (Exception ex)
+                {
+                    onError(ex);
+                }
+                finally
+                {
+                    //release the file handle held by the reader, even when extraction fails part-way
+                    if (reader != null)
+                    {
+                        reader.Close();
+                    }
+                }
 
 
-                return sb.ToString();
+                return output.ToString();
             }
 
         }
+
+        /// <summary>
+        /// Copies <paramref name="str"/>, writing a single space for every character found in
+        /// <paramref name="replaceWithSpace"/> and dropping every other character found in <paramref name="toExclude"/>.
+        /// </summary>
+        /// <param name="str">The text to clean.</param>
+        /// <param name="toExclude">Characters to remove from the result.</param>
+        /// <param name="replaceWithSpace">Characters to turn into a space; checked before <paramref name="toExclude"/>.</param>
+        /// <returns>The cleaned text.</returns>
+        private static string ExceptChars(string str, HashSet<char> toExclude, HashSet<char> replaceWithSpace)
+        {
+            var sb = new StringBuilder(str.Length);
+            for (var i = 0; i < str.Length; i++)
+            {
+                var c = str[i];
+                //the replacement set must win: '\r' and '\n' also sit inside the excluded control range,
+                //so testing toExclude first would delete them and glue the surrounding words together
+                if (replaceWithSpace.Contains(c))
+                {
+                    sb.Append(' ');
+                }
+                else if (toExclude.Contains(c) == false)
+                {
+                    sb.Append(c);
+                }
+            }
+            return sb.ToString();
+        }
 
         #endregion
     }
diff --git a/src/UmbracoExamine/BaseUmbracoIndexer.cs b/src/UmbracoExamine/BaseUmbracoIndexer.cs
index 217eac73ec..8ccef8b22e 100644
--- a/src/UmbracoExamine/BaseUmbracoIndexer.cs
+++ b/src/UmbracoExamine/BaseUmbracoIndexer.cs
@@ -96,6 +96,7 @@ namespace UmbracoExamine
         ///
         ///
         ///
+        [SecuritySafeCritical]
         public override void Initialize(string name, System.Collections.Specialized.NameValueCollection config)
         {
             if (config["dataService"] != null && !string.IsNullOrEmpty(config["dataService"]))
diff --git a/src/UmbracoExamine/UmbracoContentIndexer.cs b/src/UmbracoExamine/UmbracoContentIndexer.cs
index 81a9dbe41b..9da53a77f2 100644
--- a/src/UmbracoExamine/UmbracoContentIndexer.cs
+++ b/src/UmbracoExamine/UmbracoContentIndexer.cs
@@ -124,6 +124,7 @@ namespace UmbracoExamine
         ///
         /// An attempt is made to call on a provider after the provider has already been initialized.
         ///
+        [SecuritySafeCritical]
         public override void Initialize(string name, System.Collections.Specialized.NameValueCollection config)
         {
 
diff --git a/src/UmbracoExamine/UmbracoExamineSearcher.cs b/src/UmbracoExamine/UmbracoExamineSearcher.cs
index 68f07f45e1..7406e98f8e 100644
--- a/src/UmbracoExamine/UmbracoExamineSearcher.cs
+++ b/src/UmbracoExamine/UmbracoExamineSearcher.cs
@@ -45,6 +45,7 @@ namespace UmbracoExamine
             }
         }
 
+        [SecuritySafeCritical]
         public override void Initialize(string name, System.Collections.Specialized.NameValueCollection config)
         {
             if (name == null) throw new ArgumentNullException("name");