As Azure Search was not an option, this is what we came up with. First, we implemented an ISearchCrawlerContentProcessor to strip any unwanted content from the crawled HTML.
/// <summary>
/// Search crawler content processor that reduces crawled HTML to plain text:
/// elements flagged for exclusion are dropped, then scripts, styles and
/// remaining tags are stripped and HTML entities are decoded.
/// </summary>
public class CustomContentProcessor : ISearchCrawlerContentProcessor
{
    // Attribute that marks elements to be left out of the processed content
    // (the default Xperience exclusion attribute).
    private const string SearchExcludeAttribute = "data-ktc-search-exclude";

    private readonly IEventLogService eventLogService;

    /// <summary>
    /// Creates the processor.
    /// </summary>
    /// <param name="eventLogService">Service used to log processing failures.</param>
    public CustomContentProcessor(IEventLogService eventLogService)
    {
        this.eventLogService = eventLogService;
    }

    /// <summary>
    /// Converts the crawled HTML into plain text.
    /// </summary>
    /// <param name="htmlContent">HTML markup of the crawled page.</param>
    /// <returns>
    /// The text content of the page body, or an empty string when the input is
    /// null/blank or when processing fails (failures are written to the event log).
    /// </returns>
    public string Process(string htmlContent)
    {
        // Nothing to process; avoids feeding null into the parser and logging
        // an exception for what is just empty input.
        if (string.IsNullOrWhiteSpace(htmlContent))
        {
            return string.Empty;
        }

        try
        {
            // Parses the HTML content, using the API of the AngleSharp library
            var parser = new HtmlParser();
            var doc = parser.ParseDocument(htmlContent);

            // Removes elements marked with the exclusion attribute
            foreach (var element in doc.QuerySelectorAll($"*[{SearchExcludeAttribute}]"))
            {
                element.Remove();
            }

            // Only the body markup is kept
            var text = doc.Body?.InnerHtml ?? string.Empty;

            // Removes new line entities
            text = HTMLHelper.RegexHtmlToTextWhiteSpace.Replace(text, " ");

            // Removes JavaScript
            text = HTMLHelper.RegexHtmlToTextScript.Replace(text, " ");

            // Removes styles
            text = HTMLHelper.RegexHtmlToTextStyle.Replace(text, " ");

            // Removes tags
            text = HTMLHelper.RegexHtmlToTextTags.Replace(text, " ");

            // Decodes HTML entities
            return HTMLHelper.HTMLDecode(text);
        }
        catch (Exception ex)
        {
            // Best effort: a page that cannot be processed yields empty content
            // instead of propagating the failure to the caller.
            eventLogService.LogException("CustomContentProcessor", "PROCESS", ex);
            return string.Empty;
        }
    }
}
Next, we created a module that handles the document search GetContent event and stores this content in a custom page field.
/// <summary>
/// Module that copies the beginning of a page's search content into a custom
/// page field whenever the document search content is retrieved.
/// </summary>
public class CustomSmartSearchModule : Module
{
    // Name of the custom page field that receives the preview text.
    private const string PREVIEW_TEXT_COLUMN_NAME = "SearchResultPreviewText";

    // Maximum number of characters stored in the preview field.
    private const int PREVIEW_TEXT_MAX_LENGTH = 280;

    /// <summary>
    /// Registers the module under the name "CustomSmartSearch".
    /// </summary>
    public CustomSmartSearchModule() : base("CustomSmartSearch")
    {
    }

    /// <summary>
    /// Subscribes to the document GetContent event on module initialization.
    /// </summary>
    protected override void OnInit()
    {
        base.OnInit();
        DocumentEvents.GetContent.Execute += GetContentOnExecute;
    }

    /// <summary>
    /// Stores the trimmed content, truncated to
    /// <see cref="PREVIEW_TEXT_MAX_LENGTH"/> characters, in the preview field of the node.
    /// Nodes without the preview column and events without content are ignored.
    /// </summary>
    private void GetContentOnExecute(object sender, DocumentSearchEventArgs e)
    {
        var currentNode = e.Node;
        if (!currentNode.ContainsColumn(PREVIEW_TEXT_COLUMN_NAME) || string.IsNullOrWhiteSpace(e.Content))
        {
            return;
        }

        var trimmedContent = e.Content.Trim();

        // Substring(0, max) throws when the text is shorter than max,
        // so only truncate content that actually exceeds the limit.
        var previewContent = trimmedContent.Length > PREVIEW_TEXT_MAX_LENGTH
            ? trimmedContent.Substring(0, PREVIEW_TEXT_MAX_LENGTH)
            : trimmedContent;

        // Skip the write when the stored preview is already up to date.
        var storedPreview = currentNode.GetValue(PREVIEW_TEXT_COLUMN_NAME) as string;
        if (storedPreview == previewContent)
        {
            return;
        }

        currentNode.SetValue(PREVIEW_TEXT_COLUMN_NAME, previewContent);
        currentNode.Update();
    }
}
Finally, we used this custom field as the content source in the search configuration of every indexed page type.