using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.Linq;
using System.Security;
using System.Text;
using System.Web;
using System.Xml.Linq;
using Examine;
using Examine.Config;
using Examine.Providers;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Umbraco.Core;
using umbraco.cms.businesslogic;
using Umbraco.Core.Models;
using Umbraco.Core.Persistence.DatabaseModelDefinitions;
using Umbraco.Core.Services;
using UmbracoExamine.DataServices;
using Examine.LuceneEngine;
using Examine.LuceneEngine.Config;
using UmbracoExamine.Config;
using Examine.LuceneEngine.Providers;
using Lucene.Net.Analysis;
using umbraco.BasePages;
using IContentService = Umbraco.Core.Services.IContentService;
using UmbracoExamine.LocalStorage;
using IMediaService = Umbraco.Core.Services.IMediaService;

namespace UmbracoExamine
{
    /// <summary>
    /// An indexer for Umbraco content and media
    /// </summary>
    public class UmbracoContentIndexer : BaseUmbracoIndexer
    {
        private readonly IContentService _contentService;
        private readonly IMediaService _mediaService;
        private readonly IDataTypeService _dataTypeService;
        private readonly IUserService _userService;

        #region Constructors

        /// <summary>
        /// Default constructor
        /// </summary>
        public UmbracoContentIndexer()
            : base()
        {
            _contentService = ApplicationContext.Current.Services.ContentService;
            _mediaService = ApplicationContext.Current.Services.MediaService;
            _dataTypeService = ApplicationContext.Current.Services.DataTypeService;
            _userService = ApplicationContext.Current.Services.UserService;
        }

        /// <summary>
        /// Constructor to allow for creating an indexer at runtime
        /// </summary>
        /// <param name="indexerData"></param>
        /// <param name="indexPath"></param>
        /// <param name="dataService"></param>
        /// <param name="analyzer"></param>
        /// <param name="async"></param>
        [Obsolete("Use the overload that specifies the Umbraco services")]
        public UmbracoContentIndexer(IIndexCriteria indexerData, DirectoryInfo indexPath, IDataService dataService, Analyzer analyzer, bool async)
            : base(indexerData, indexPath, dataService, analyzer, async)
        {
            _contentService = ApplicationContext.Current.Services.ContentService;
            _mediaService = ApplicationContext.Current.Services.MediaService;
            _dataTypeService = ApplicationContext.Current.Services.DataTypeService;
            _userService = ApplicationContext.Current.Services.UserService;
        }

        /// <summary>
        /// Constructor to allow for creating an indexer at runtime
        /// </summary>
        /// <param name="indexerData"></param>
        /// <param name="luceneDirectory"></param>
        /// <param name="dataService"></param>
        /// <param name="analyzer"></param>
        /// <param name="async"></param>
        [Obsolete("Use the overload that specifies the Umbraco services")]
        public UmbracoContentIndexer(IIndexCriteria indexerData, Lucene.Net.Store.Directory luceneDirectory, IDataService dataService, Analyzer analyzer, bool async)
            : base(indexerData, luceneDirectory, dataService, analyzer, async)
        {
            _contentService = ApplicationContext.Current.Services.ContentService;
            _mediaService = ApplicationContext.Current.Services.MediaService;
            _dataTypeService = ApplicationContext.Current.Services.DataTypeService;
            _userService = ApplicationContext.Current.Services.UserService;
        }

        /// <summary>
        /// Constructor to allow for creating an indexer at runtime
        /// </summary>
        /// <param name="indexerData"></param>
        /// <param name="luceneDirectory"></param>
        /// <param name="dataService"></param>
        /// <param name="contentService"></param>
        /// <param name="mediaService"></param>
        /// <param name="dataTypeService"></param>
        /// <param name="userService"></param>
        /// <param name="analyzer"></param>
        /// <param name="async"></param>
        public UmbracoContentIndexer(IIndexCriteria indexerData, Lucene.Net.Store.Directory luceneDirectory, IDataService dataService,
            IContentService contentService, IMediaService mediaService, IDataTypeService dataTypeService, IUserService userService,
            Analyzer analyzer, bool async)
            : base(indexerData, luceneDirectory, dataService, analyzer, async)
        {
            _contentService = contentService;
            _mediaService = mediaService;
            _dataTypeService = dataTypeService;
            _userService = userService;
        }

        #endregion

        #region Constants & Fields

        /// <summary>
        /// Used to store the path of a content object
        /// </summary>
        public const string IndexPathFieldName = "__Path";
        public const string NodeTypeAliasFieldName = "__NodeTypeAlias";
        public const string IconFieldName = "__Icon";

        /// <summary>
        /// The prefix added to a field when it is duplicated in order to store the original raw value.
        /// </summary>
        public const string RawFieldPrefix = "__Raw_";

        /// <summary>
        /// Defines the index policy for each standard (non user defined) Umbraco field.
        /// A lot of standard Umbraco fields shouldn't be tokenized or even indexed, just stored into Lucene
        /// for retrieval after searching.
        /// </summary>
        internal static readonly List<StaticField> IndexFieldPolicies = new List<StaticField>
        {
            new StaticField("id", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("key", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("version", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("parentID", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("level", FieldIndexTypes.NOT_ANALYZED, true, "NUMBER"),
            new StaticField("writerID", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("creatorID", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("nodeType", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("template", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("sortOrder", FieldIndexTypes.NOT_ANALYZED, true, "NUMBER"),
            new StaticField("createDate", FieldIndexTypes.NOT_ANALYZED, false, "DATETIME"),
            new StaticField("updateDate", FieldIndexTypes.NOT_ANALYZED, false, "DATETIME"),
            new StaticField("nodeName", FieldIndexTypes.ANALYZED, false, string.Empty),
            new StaticField("urlName", FieldIndexTypes.NOT_ANALYZED, false, string.Empty),
            new StaticField("writerName", FieldIndexTypes.ANALYZED, false, string.Empty),
            new StaticField("creatorName", FieldIndexTypes.ANALYZED, false, string.Empty),
            new StaticField("nodeTypeAlias", FieldIndexTypes.ANALYZED, false, string.Empty),
            new StaticField("path", FieldIndexTypes.NOT_ANALYZED, false, string.Empty)
        };

        #endregion
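        // For context, a minimal sketch (assumed, not taken from this codebase) of how the options consumed
        // by Initialize() below are typically supplied via an Examine configuration file. Initialize() reads
        // the "supportUnpublished" and "supportProtected" attributes from its NameValueCollection; the
        // provider name and analyzer shown here are illustrative assumptions.
        //
        //   <ExamineIndexProviders>
        //     <providers>
        //       <add name="MyContentIndexer"
        //            type="UmbracoExamine.UmbracoContentIndexer, UmbracoExamine"
        //            supportUnpublished="true"
        //            supportProtected="true"
        //            analyzer="Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net"/>
        //     </providers>
        //   </ExamineIndexProviders>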
        #region Initialize

        /// <summary>
        /// Set up all properties for the indexer based on the configuration information specified. This will ensure that
        /// all of the folders required by the indexer are created and exist. This will also create an instruction
        /// file declaring the computer name that is taking part in the indexing. This file will then be used to
        /// determine the master indexer machine in a load balanced environment (if one exists).
        /// </summary>
        /// <param name="name">The friendly name of the provider.</param>
        /// <param name="config">A collection of the name/value pairs representing the provider-specific attributes specified in the configuration for this provider.</param>
        /// <exception cref="T:System.ArgumentNullException">The name of the provider is null.</exception>
        /// <exception cref="T:System.ArgumentException">The name of the provider has a length of zero.</exception>
        /// <exception cref="T:System.InvalidOperationException">An attempt is made to call Initialize on a provider after the provider has already been initialized.</exception>
        public override void Initialize(string name, NameValueCollection config)
        {
            //check if there's a flag specifying to support unpublished content,
            //if not, set to false
            bool supportUnpublished;
            if (config["supportUnpublished"] != null && bool.TryParse(config["supportUnpublished"], out supportUnpublished))
                SupportUnpublishedContent = supportUnpublished;
            else
                SupportUnpublishedContent = false;

            //check if there's a flag specifying to support protected content,
            //if not, set to false
            bool supportProtected;
            if (config["supportProtected"] != null && bool.TryParse(config["supportProtected"], out supportProtected))
                SupportProtectedContent = supportProtected;
            else
                SupportProtectedContent = false;

            base.Initialize(name, config);
        }

        #endregion

        #region Properties

        /// <summary>
        /// By default this is false. If set to true then the indexer will also index content that is flagged as publicly protected.
        /// This property is ignored if SupportUnpublishedContent is set to true.
        /// </summary>
        public bool SupportProtectedContent { get; protected internal set; }

        protected override IEnumerable<string> SupportedTypes
        {
            get { return new[] { IndexTypes.Content, IndexTypes.Media }; }
        }

        #endregion

        #region Event handlers

        protected override void OnIndexingError(IndexingErrorEventArgs e)
        {
            DataService.LogService.AddErrorLog(e.NodeId, string.Format("{0}, {1}, IndexSet: {2}",
                e.Message,
                e.InnerException != null ? e.InnerException.ToString() : "",
                IndexSetName));
            base.OnIndexingError(e);
        }

        /// <summary>
        /// This ensures that the special __Raw_ fields are indexed
        /// </summary>
        /// <param name="docArgs"></param>
        protected override void OnDocumentWriting(DocumentWritingEventArgs docArgs)
        {
            var d = docArgs.Document;
            foreach (var f in docArgs.Fields.Where(x => x.Key.StartsWith(RawFieldPrefix)))
            {
                d.Add(new Field(
                    f.Key,
                    f.Value,
                    Field.Store.YES,
                    Field.Index.NO, //don't index this field, we never want to search by it
                    Field.TermVector.NO));
            }

            base.OnDocumentWriting(docArgs);
        }

        protected override void OnNodeIndexed(IndexedNodeEventArgs e)
        {
            DataService.LogService.AddVerboseLog(e.NodeId, string.Format("Index created for node {0}", e.NodeId));
            base.OnNodeIndexed(e);
        }

        protected override void OnIndexDeleted(DeleteIndexEventArgs e)
        {
            DataService.LogService.AddVerboseLog(-1, string.Format("Index deleted for term: {0} with value {1}", e.DeletedTerm.Key, e.DeletedTerm.Value));
            base.OnIndexDeleted(e);
        }

        protected override void OnIndexOptimizing(EventArgs e)
        {
            DataService.LogService.AddInfoLog(-1, "Index is being optimized");
            base.OnIndexOptimizing(e);
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Overridden for logging
        /// </summary>
        /// <param name="node"></param>
        /// <param name="type"></param>
        public override void ReIndexNode(XElement node, string type)
        {
            if (!SupportedTypes.Contains(type))
                return;

            if (node.Attribute("id") != null)
            {
                DataService.LogService.AddVerboseLog((int)node.Attribute("id"), string.Format("ReIndexNode with type: {0}", type));
                base.ReIndexNode(node, type);
            }
            else
            {
                DataService.LogService.AddErrorLog(-1, string.Format("ReIndexNode cannot proceed, the format of the XElement is invalid, the xml has no 'id' attribute. {0}", node));
            }
        }
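        // For reference, a hedged sketch of the serialized node shape the indexing methods here consume;
        // the element name and attribute values are illustrative only. OnGatheringNodeData below reads the
        // "id", "path" and "icon" attributes, and resolves the type alias from the "nodeTypeAlias" attribute
        // (legacy schema) or from the element name itself (current schema):
        //
        //   <node id="1052" parentID="1051" path="-1,1051,1052" nodeTypeAlias="textPage" icon="icon-document">
        //     <bodyText><![CDATA[...]]></bodyText>
        //   </node>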
        /// <summary>
        /// Deletes a node from the index.
        /// </summary>
        /// <remarks>
        /// When a content node is deleted, we also need to delete its children from the index, so we perform a
        /// custom Lucene search to find all descendants and create delete item queues for them too.
        /// </remarks>
        /// <param name="nodeId">ID of the node to delete</param>
        public override void DeleteFromIndex(string nodeId)
        {
            //find all descendants based on path: this raw query matches any __Path value that contains
            //the node id followed by a comma, i.e. the node's descendants
            var descendantPath = string.Format(@"\-1\,*{0}\,*", nodeId);
            var rawQuery = string.Format("{0}:{1}", IndexPathFieldName, descendantPath);
            var c = InternalSearcher.CreateSearchCriteria();
            var filtered = c.RawQuery(rawQuery);
            var results = InternalSearcher.Search(filtered);

            DataService.LogService.AddVerboseLog(int.Parse(nodeId), string.Format("DeleteFromIndex with query: {0} (found {1} results)", rawQuery, results.Count()));

            //need to create a delete queue item for each one found
            foreach (var r in results)
            {
                EnqueueIndexOperation(new IndexOperation
                {
                    Operation = IndexOperationType.Delete,
                    Item = new IndexItem(null, "", r.Id.ToString())
                });
                //SaveDeleteIndexQueueItem(new KeyValuePair<string, string>(IndexNodeIdFieldName, r.Id.ToString()));
            }

            base.DeleteFromIndex(nodeId);
        }

        #endregion

        #region Protected

        protected override void PerformIndexAll(string type)
        {
            const int pageSize = 1000;
            var pageIndex = 0;

            switch (type)
            {
                case IndexTypes.Content:
                    if (SupportUnpublishedContent == false)
                    {
                        //use the base implementation which will use the published XML cache to perform the lookups
                        base.PerformIndexAll(type);
                    }
                    else
                    {
                        var contentParentId = -1;
                        if (IndexerData.ParentNodeId.HasValue && IndexerData.ParentNodeId.Value > 0)
                        {
                            contentParentId = IndexerData.ParentNodeId.Value;
                        }

                        IContent[] descendants;
                        do
                        {
                            long total;
                            descendants = _contentService.GetPagedDescendants(contentParentId, pageIndex, pageSize, out total).ToArray();

                            //if specific types are declared we need to post filter them
                            //TODO: Update the service layer to join the cmsContentType table so we can query by content type too
                            //NOTE: the loop condition below must check the unfiltered page count; checking the
                            // filtered count would end paging early whenever a page contains excluded types
                            var content = IndexerData.IncludeNodeTypes.Any()
                                ? descendants.Where(x => IndexerData.IncludeNodeTypes.Contains(x.ContentType.Alias)).ToArray()
                                : descendants;

                            AddNodesToIndex(GetSerializedContent(content), type);
                            pageIndex++;
                        } while (descendants.Length == pageSize);
                    }
                    break;
                case IndexTypes.Media:
                    var mediaParentId = -1;
                    if (IndexerData.ParentNodeId.HasValue && IndexerData.ParentNodeId.Value > 0)
                    {
                        mediaParentId = IndexerData.ParentNodeId.Value;
                    }

                    IMedia[] allMedia;
                    do
                    {
                        long total;
                        allMedia = _mediaService.GetPagedDescendants(mediaParentId, pageIndex, pageSize, out total).ToArray();

                        //if specific types are declared we need to post filter them
                        //TODO: Update the service layer to join the cmsContentType table so we can query by content type too
                        var media = IndexerData.IncludeNodeTypes.Any()
                            ? allMedia.Where(x => IndexerData.IncludeNodeTypes.Contains(x.ContentType.Alias)).ToArray()
                            : allMedia;

                        AddNodesToIndex(GetSerializedMedia(media), type);
                        pageIndex++;
                    } while (allMedia.Length == pageSize);
                    break;
            }
        }

        private IEnumerable<XElement> GetSerializedMedia(IEnumerable<IMedia> media)
        {
            var serializer = new EntityXmlSerializer();
            foreach (var m in media)
            {
                var xml = serializer.Serialize(_mediaService, _dataTypeService, _userService, m);

                //add a custom 'icon' attribute
                if (m.ContentType.Icon.IsNullOrWhiteSpace() == false)
                {
                    xml.Add(new XAttribute("icon", m.ContentType.Icon));
                }

                yield return xml;
            }
        }

        private IEnumerable<XElement> GetSerializedContent(IEnumerable<IContent> content)
        {
            var serializer = new EntityXmlSerializer();
            foreach (var c in content)
            {
                var xml = serializer.Serialize(_contentService, _dataTypeService, _userService, c);

                //add a custom 'icon' attribute
                xml.Add(new XAttribute("icon", c.ContentType.Icon));

                yield return xml;
            }
        }
        /// <summary>
        /// Overridden for logging.
        /// </summary>
        /// <param name="node"></param>
        /// <param name="type"></param>
        protected override void AddSingleNodeToIndex(XElement node, string type)
        {
            DataService.LogService.AddVerboseLog((int)node.Attribute("id"), string.Format("AddSingleNodeToIndex with type: {0}", type));
            base.AddSingleNodeToIndex(node, type);
        }

        public override void RebuildIndex()
        {
            DataService.LogService.AddVerboseLog(-1, "Rebuilding index");
            base.RebuildIndex();
        }

        /// <summary>
        /// Used to refresh the current IndexerData from the data in the DataService. This can be used
        /// if properties have been added to or removed from the database.
        /// </summary>
        public void RefreshIndexerDataFromDataService()
        {
            //TODO: This would be much better done if the IndexerData property had read/write locks applied
            // to it! Unless we update the base class there's really no way to prevent the IndexerData from being
            // changed during an operation that is reading from it.
            var newIndexerData = GetIndexerData(IndexSets.Instance.Sets[IndexSetName]);
            IndexerData = newIndexerData;
        }

        /// <summary>
        /// Overridden to strip all HTML from user fields before raising the event, then after the event
        /// ensure our special Path field is added to the collection.
        /// </summary>
        /// <param name="e"></param>
        protected override void OnGatheringNodeData(IndexingNodeDataEventArgs e)
        {
            //strip html from all user fields if we detect the value contains HTML.
            //if that is the case, we'll create a duplicate 'raw' copy of it so that we can return
            //the value of the field 'as-is'.

            // Get all user data that we want to index and store into a dictionary
            foreach (var field in IndexerData.UserFields)
            {
                if (e.Fields.ContainsKey(field.Name))
                {
                    //check if the field value has html
                    if (XmlHelper.CouldItBeXml(e.Fields[field.Name]))
                    {
                        //First save the raw value to a raw field, we will change the policy of this field by detecting the prefix later
                        e.Fields[RawFieldPrefix + field.Name] = e.Fields[field.Name];
                        //now replace the original value with the stripped html
                        e.Fields[field.Name] = DataService.ContentService.StripHtml(e.Fields[field.Name]);
                    }
                }
            }

            base.OnGatheringNodeData(e);

            //ensure the special path and node type alias fields are added to the dictionary to be saved to file
            var path = e.Node.Attribute("path").Value;
            if (!e.Fields.ContainsKey(IndexPathFieldName))
                e.Fields.Add(IndexPathFieldName, path);

            //this needs to support both schemas, so get the nodeTypeAlias attribute if it exists, otherwise use the element name
            var nodeTypeAlias = e.Node.Attribute("nodeTypeAlias") == null
                ? e.Node.Name.LocalName
                : e.Node.Attribute("nodeTypeAlias").Value;
            if (!e.Fields.ContainsKey(NodeTypeAliasFieldName))
                e.Fields.Add(NodeTypeAliasFieldName, nodeTypeAlias);

            //add icon
            var icon = (string)e.Node.Attribute("icon");
            if (!e.Fields.ContainsKey(IconFieldName))
                e.Fields.Add(IconFieldName, icon);
        }

        /// <summary>
        /// Called when a duplicate field is detected in the dictionary that is getting indexed.
        /// </summary>
        /// <param name="nodeId"></param>
        /// <param name="indexSetName"></param>
        /// <param name="fieldName"></param>
        protected override void OnDuplicateFieldWarning(int nodeId, string indexSetName, string fieldName)
        {
            base.OnDuplicateFieldWarning(nodeId, indexSetName, fieldName);

            DataService.LogService.AddInfoLog(nodeId, "Field \"" + fieldName + "\" is listed multiple times in the index set \"" + indexSetName + "\". Please ensure all names are unique");
        }
        /// <summary>
        /// Overridden to add the path property to the special fields to index
        /// </summary>
        /// <param name="allValuesForIndexing"></param>
        /// <returns></returns>
        protected override Dictionary<string, string> GetSpecialFieldsToIndex(Dictionary<string, string> allValuesForIndexing)
        {
            var fields = base.GetSpecialFieldsToIndex(allValuesForIndexing);

            //adds the special path property to the index
            fields.Add(IndexPathFieldName, allValuesForIndexing[IndexPathFieldName]);

            //adds the special node type alias property to the index
            fields.Add(NodeTypeAliasFieldName, allValuesForIndexing[NodeTypeAliasFieldName]);

            //icon
            if (allValuesForIndexing[IconFieldName].IsNullOrWhiteSpace() == false)
            {
                fields.Add(IconFieldName, allValuesForIndexing[IconFieldName]);
            }

            return fields;
        }

        /// <summary>
        /// Creates an IIndexCriteria object based on the indexSet passed in and our DataService
        /// </summary>
        /// <param name="indexSet"></param>
        /// <returns></returns>
        /// <remarks>
        /// If we cannot initialize we will pass back empty indexer data since we cannot read from the database
        /// </remarks>
        protected override IIndexCriteria GetIndexerData(IndexSet indexSet)
        {
            if (CanInitialize())
            {
                return indexSet.ToIndexCriteria(DataService, IndexFieldPolicies);
            }
            else
            {
                return base.GetIndexerData(indexSet);
            }
        }

        /// <summary>
        /// Returns the index policy for the field name passed in; if no policy is found the field is ANALYZED by default.
        /// </summary>
        /// <param name="fieldName"></param>
        /// <returns></returns>
        protected override FieldIndexTypes GetPolicy(string fieldName)
        {
            var def = IndexFieldPolicies.Where(x => x.Name == fieldName).ToArray();
            return (def.Any() == false ? FieldIndexTypes.ANALYZED : def.Single().IndexType);
        }

        /// <summary>
        /// Ensures that the content of this node is available for indexing (i.e. don't allow protected
        /// content to be indexed when this is disabled).
        /// </summary>
        /// <param name="node"></param>
        /// <returns></returns>
        protected override bool ValidateDocument(XElement node)
        {
            var nodeId = int.Parse(node.Attribute("id").Value);

            // Test for access if we're only indexing published content:
            // return false if we're not supporting protected content and this node is protected
            if (!SupportUnpublishedContent
                && (!SupportProtectedContent
                    && DataService.ContentService.IsProtected(nodeId, node.Attribute("path").Value)))
            {
                return false;
            }

            return base.ValidateDocument(node);
        }

        #endregion
    }
}
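// A minimal usage sketch, assuming an indexer has been registered in configuration under the
// hypothetical name "MyContentIndexer". RebuildIndex() and ReIndexNode(...) are the public entry
// points defined above; resolving the provider through ExamineManager is standard Examine usage:
//
//   var indexer = ExamineManager.Instance.IndexProviderCollection["MyContentIndexer"];
//   indexer.RebuildIndex(); //wipes and re-populates the index for all supported types (content and media)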