diff --git a/BotSharp.sln b/BotSharp.sln
index e992d26ad..102137084 100644
--- a/BotSharp.sln
+++ b/BotSharp.sln
@@ -149,6 +149,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ExcelHandle
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ImageHandler", "src\Plugins\BotSharp.Plugin.ImageHandler\BotSharp.Plugin.ImageHandler.csproj", "{242F2D93-FCCE-4982-8075-F3052ECCA92C}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.FuzzySharp", "src\Plugins\BotSharp.Plugin.FuzzySharp\BotSharp.Plugin.FuzzySharp.csproj", "{E7C243B9-E751-B3B4-8F16-95C76CA90D31}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -629,6 +631,14 @@ Global
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|Any CPU.Build.0 = Release|Any CPU
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.ActiveCfg = Release|Any CPU
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.Build.0 = Release|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.Build.0 = Debug|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.Build.0 = Release|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.ActiveCfg = Release|Any CPU
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -701,6 +711,7 @@ Global
{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
{FC63C875-E880-D8BB-B8B5-978AB7B62983} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
{242F2D93-FCCE-4982-8075-F3052ECCA92C} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
+ {E7C243B9-E751-B3B4-8F16-95C76CA90D31} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {A9969D89-C98B-40A5-A12B-FC87E55B3A19}
diff --git a/Directory.Packages.props b/Directory.Packages.props
index c7ef3907b..53e2bb5be 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -5,6 +5,8 @@
true
+
+
@@ -18,6 +20,7 @@
+
diff --git a/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseCollection.cs b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseCollection.cs
new file mode 100644
index 000000000..9238e3220
--- /dev/null
+++ b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseCollection.cs
@@ -0,0 +1,7 @@
+namespace BotSharp.Abstraction.Knowledges;
+
+public interface IPhraseCollection
+{
+    Task<Dictionary<string, HashSet<string>>> LoadVocabularyAsync();
+    Task<Dictionary<string, (string DbPath, string CanonicalForm)>> LoadSynonymMappingAsync();
+}
diff --git a/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseService.cs b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseService.cs
new file mode 100644
index 000000000..1ca84024a
--- /dev/null
+++ b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseService.cs
@@ -0,0 +1,6 @@
+namespace BotSharp.Abstraction.Knowledges;
+
+public interface IPhraseService
+{
+    Task<List<SearchPhrasesResult>> SearchPhrasesAsync(string term);
+}
\ No newline at end of file
diff --git a/src/Infrastructure/BotSharp.Abstraction/Knowledges/Models/SearchPhrasesResult.cs b/src/Infrastructure/BotSharp.Abstraction/Knowledges/Models/SearchPhrasesResult.cs
new file mode 100644
index 000000000..64dc0c18f
--- /dev/null
+++ b/src/Infrastructure/BotSharp.Abstraction/Knowledges/Models/SearchPhrasesResult.cs
@@ -0,0 +1,11 @@
+
+namespace BotSharp.Abstraction.Knowledges.Models;
+
+public class SearchPhrasesResult
+{
+ public string Token { get; set; } = string.Empty;
+    public List<string> Sources { get; set; } = new();
+ public string CanonicalForm { get; set; } = string.Empty;
+ public string MatchType { get; set; } = string.Empty;
+ public double Confidence { get; set; }
+}
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/BotSharp.Plugin.FuzzySharp.csproj b/src/Plugins/BotSharp.Plugin.FuzzySharp/BotSharp.Plugin.FuzzySharp.csproj
new file mode 100644
index 000000000..8561dc204
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/BotSharp.Plugin.FuzzySharp.csproj
@@ -0,0 +1,21 @@
+
+
+
+ $(TargetFramework)
+ enable
+ $(LangVersion)
+ $(BotSharpVersion)
+ $(GeneratePackageOnBuild)
+ $(GenerateDocumentationFile)
+ $(SolutionDir)packages
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/MatchReason.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/MatchReason.cs
new file mode 100644
index 000000000..f46b3abf7
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/MatchReason.cs
@@ -0,0 +1,20 @@
+
+namespace BotSharp.Plugin.FuzzySharp.Constants;
+
+public static class MatchReason
+{
+ ///
+ /// Token matched a synonym term (e.g., HVAC -> Air Conditioning/Heating)
+ ///
+ public const string SynonymMatch = "synonym_match";
+
+ ///
+ /// Token exactly matched a vocabulary entry
+ ///
+ public const string ExactMatch = "exact_match";
+
+ ///
+ /// Token was flagged as a potential typo and a correction was suggested
+ ///
+ public const string TypoCorrection = "typo_correction";
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/TextConstants.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/TextConstants.cs
new file mode 100644
index 000000000..a8c749d13
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/TextConstants.cs
@@ -0,0 +1,29 @@
+
+namespace BotSharp.Plugin.FuzzySharp.Constants;
+
+public static class TextConstants
+{
+ ///
+ /// Characters that need to be separated during tokenization (by adding spaces before and after)
+ /// Includes: parentheses, brackets, braces, punctuation marks, special symbols, etc.
+ /// This ensures "(IH)" is split into "(", "IH", ")"
+ ///
+ public static readonly char[] SeparatorChars =
+ {
+ // Parentheses and brackets
+ '(', ')', '[', ']', '{', '}',
+ // Punctuation marks
+ ',', '.', ';', ':', '!', '?',
+ // Special symbols
+ '=', '@', '#', '$', '%', '^', '&', '*', '+', '-', '\\', '|', '<', '>', '~', '`'
+ };
+
+ ///
+ /// Whitespace characters used as token separators during tokenization.
+ /// Includes: space, tab, newline, and carriage return.
+ ///
+ public static readonly char[] TokenSeparators =
+ {
+ ' ', '\t', '\n', '\r'
+ };
+}
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Controllers/FuzzySharpController.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Controllers/FuzzySharpController.cs
new file mode 100644
index 000000000..bd1288b92
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Controllers/FuzzySharpController.cs
@@ -0,0 +1,59 @@
+using BotSharp.Abstraction.Knowledges;
+using BotSharp.Abstraction.Knowledges.Models;
+using Microsoft.AspNetCore.Http;
+using Microsoft.AspNetCore.Mvc;
+using Microsoft.Extensions.Logging;
+
+namespace BotSharp.Plugin.FuzzySharp.Controllers;
+
+[ApiController]
+public class FuzzySharpController : ControllerBase
+{
+    private readonly IPhraseService _phraseService;
+    private readonly ILogger<FuzzySharpController> _logger;
+
+    public FuzzySharpController(
+        IPhraseService phraseService,
+        ILogger<FuzzySharpController> logger)
+ {
+ _phraseService = phraseService;
+ _logger = logger;
+ }
+
+ ///
+ /// Analyze text for typos and entities using vocabulary.
+ ///
+ /// Returns:
+ /// - `original`: Original input text
+ /// - `tokens`: Tokenized text (only included if `include_tokens=true`)
+ /// - `flagged`: List of flagged items (each with `match_type`):
+ /// - `synonym_match` - Business abbreviations (confidence=1.0)
+ /// - `exact_match` - Exact vocabulary matches (confidence=1.0)
+ /// - `typo_correction` - Spelling corrections (confidence less than 1.0)
+ /// - `processing_time_ms`: Processing time in milliseconds
+ ///
+ /// Text analysis request
+ /// Text analysis response
+ [HttpPost("fuzzy-sharp/analyze-text")]
+    [ProducesResponseType(typeof(List<SearchPhrasesResult>), StatusCodes.Status200OK)]
+ [ProducesResponseType(StatusCodes.Status400BadRequest)]
+ [ProducesResponseType(StatusCodes.Status500InternalServerError)]
+    public async Task<IActionResult> AnalyzeText([FromBody] string text)
+ {
+ try
+ {
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return BadRequest(new { error = "Text is required" });
+ }
+
+ var result = await _phraseService.SearchPhrasesAsync(text);
+ return Ok(result);
+ }
+ catch (Exception ex)
+ {
+ _logger.LogError(ex, "Error analyzing and searching entities");
+ return StatusCode(500, new { error = $"Error analyzing and searching entities: {ex.Message}" });
+ }
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Arguments/TextAnalysisRequest.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Arguments/TextAnalysisRequest.cs
new file mode 100644
index 000000000..92bfad905
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Arguments/TextAnalysisRequest.cs
@@ -0,0 +1,13 @@
+
+namespace BotSharp.Plugin.FuzzySharp.FuzzSharp.Arguments;
+
+public class TextAnalysisRequest
+{
+ public string Text { get; set; } = string.Empty;
+ public string? VocabularyFolderName { get; set; }
+ public string? SynonymMappingFile { get; set; }
+ public double Cutoff { get; set; } = 0.82;
+ public int TopK { get; set; } = 5;
+ public int MaxNgram { get; set; } = 5;
+ public bool IncludeTokens { get; set; } = false;
+}
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/INgramProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/INgramProcessor.cs
new file mode 100644
index 000000000..90a9a06f1
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/INgramProcessor.cs
@@ -0,0 +1,26 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp.Models;
+
+namespace BotSharp.Plugin.FuzzySharp.FuzzSharp;
+
+public interface INgramProcessor
+{
+ ///
+ /// Process tokens and generate all possible n-gram match results
+ ///
+ /// List of tokens to process
+ /// Vocabulary (source -> vocabulary set)
+ /// Synonym term Mapping
+ /// Lookup table (lowercase vocabulary -> (canonical form, source list))
+ /// Maximum n-gram length
+ /// Minimum confidence threshold for fuzzy matching
+ /// Maximum number of matches to return
+ /// List of flagged items
+    List<FlaggedItem> ProcessNgrams(
+        List<string> tokens,
+        Dictionary<string, HashSet<string>> vocabulary,
+        Dictionary<string, (string DbPath, string CanonicalForm)> synonymMapping,
+        Dictionary<string, (string CanonicalForm, List<string> Sources)> lookup,
+        int maxNgram,
+        double cutoff,
+        int topK);
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/IResultProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/IResultProcessor.cs
new file mode 100644
index 000000000..c900877bf
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/IResultProcessor.cs
@@ -0,0 +1,17 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp.Models;
+
+namespace BotSharp.Plugin.FuzzySharp.FuzzSharp;
+
+///
+/// Result processor interface
+/// Responsible for processing match results, including deduplication and sorting
+///
+public interface IResultProcessor
+{
+ ///
+ /// Process a list of flagged items, removing overlapping duplicates and sorting
+ ///
+ /// List of flagged items to process
+ /// Processed list of flagged items (deduplicated and sorted)
+    List<FlaggedItem> ProcessResults(List<FlaggedItem> flagged);
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITokenMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITokenMatcher.cs
new file mode 100644
index 000000000..c715a8255
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITokenMatcher.cs
@@ -0,0 +1,39 @@
+namespace BotSharp.Plugin.FuzzySharp.FuzzSharp;
+
+public interface ITokenMatcher
+{
+ ///
+ /// Try to match a content span and return a match result
+ ///
+ /// The matching context containing all necessary information
+ /// Match result if found, null otherwise
+ MatchResult? TryMatch(MatchContext context);
+
+ ///
+ /// Priority of this matcher (higher priority matchers are tried first)
+ ///
+ int Priority { get; }
+}
+
+///
+/// Context information for token matching
+///
+public record MatchContext(
+    string ContentSpan,
+    string ContentLow,
+    int StartIndex,
+    int NgramLength,
+    Dictionary<string, HashSet<string>> Vocabulary,
+    Dictionary<string, (string DbPath, string CanonicalForm)> SynonymMapping,
+    Dictionary<string, (string CanonicalForm, List<string> Sources)> Lookup,
+    double Cutoff,
+    int TopK);
+
+///
+/// Result of a token match
+///
+public record MatchResult(
+    string CanonicalForm,
+    List<string> Sources,
+    string MatchType,
+    double Confidence);
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/FlaggedItem.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/FlaggedItem.cs
new file mode 100644
index 000000000..67bbd2802
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/FlaggedItem.cs
@@ -0,0 +1,13 @@
+
+namespace BotSharp.Plugin.FuzzySharp.FuzzSharp.Models;
+
+public class FlaggedItem
+{
+ public int Index { get; set; }
+ public string Token { get; set; } = string.Empty;
+    public List<string> Sources { get; set; } = new();
+ public string MatchType { get; set; } = string.Empty;
+ public string CanonicalForm { get; set; } = string.Empty;
+ public double Confidence { get; set; }
+ public int NgramLength { get; set; }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/TextAnalysisResponse.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/TextAnalysisResponse.cs
new file mode 100644
index 000000000..0a05d9cd1
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/TextAnalysisResponse.cs
@@ -0,0 +1,10 @@
+
+namespace BotSharp.Plugin.FuzzySharp.FuzzSharp.Models;
+
+public class TextAnalysisResponse
+{
+ public string Original { get; set; } = string.Empty;
+    public List<string>? Tokens { get; set; }
+    public List<FlaggedItem> Flagged { get; set; } = new();
+ public double ProcessingTimeMs { get; set; }
+}
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzySharpPlugin.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzySharpPlugin.cs
new file mode 100644
index 000000000..1a125ea08
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzySharpPlugin.cs
@@ -0,0 +1,29 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp;
+using BotSharp.Abstraction.Knowledges;
+using BotSharp.Abstraction.Plugins;
+using BotSharp.Plugin.FuzzySharp.Services;
+using BotSharp.Plugin.FuzzySharp.Services.Matching;
+using BotSharp.Plugin.FuzzySharp.Services.Processors;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+
+namespace BotSharp.Plugin.FuzzySharp;
+
+public class FuzzySharpPlugin : IBotSharpPlugin
+{
+ public string Id => "379e6f7b-c58c-458b-b8cd-0374e5830711";
+ public string Name => "Fuzzy Sharp";
+ public string Description => "Analyze text for typos and entities using domain-specific vocabulary.";
+ public string IconUrl => "https://cdn-icons-png.flaticon.com/512/9592/9592995.png";
+
+ public void RegisterDI(IServiceCollection services, IConfiguration config)
+ {
+        services.AddScoped<IPhraseService, PhraseService>();
+        services.AddScoped<IPhraseCollection, CsvPhraseCollectionLoader>();
+        services.AddScoped<INgramProcessor, NgramProcessor>();
+        services.AddScoped<IResultProcessor, ResultProcessor>();
+        services.AddScoped<ITokenMatcher, SynonymMatcher>();
+        services.AddScoped<ITokenMatcher, ExactMatcher>();
+        services.AddScoped<ITokenMatcher, FuzzyMatcher>();
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/CsvPhraseCollectionLoader.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/CsvPhraseCollectionLoader.cs
new file mode 100644
index 000000000..af471bfee
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/CsvPhraseCollectionLoader.cs
@@ -0,0 +1,187 @@
+using BotSharp.Abstraction.Knowledges;
+using BotSharp.Core.Infrastructures;
+using CsvHelper;
+using CsvHelper.Configuration;
+using Microsoft.Extensions.Logging;
+using System.Globalization;
+using System.IO;
+
+namespace BotSharp.Plugin.FuzzySharp.Services;
+
+public class CsvPhraseCollectionLoader : IPhraseCollection
+{
+    private readonly ILogger<CsvPhraseCollectionLoader> _logger;
+
+    public CsvPhraseCollectionLoader(ILogger<CsvPhraseCollectionLoader> logger)
+ {
+ _logger = logger;
+ }
+
+ [SharpCache(60)]
+    public async Task<Dictionary<string, HashSet<string>>> LoadVocabularyAsync()
+    {
+        string foldername = "";
+        var vocabulary = new Dictionary<string, HashSet<string>>();
+
+ if (string.IsNullOrEmpty(foldername))
+ {
+ return vocabulary;
+ }
+
+ // Load CSV files from the folder
+ var csvFileDict = await LoadCsvFilesFromFolderAsync(foldername);
+ if (csvFileDict.Count == 0)
+ {
+ return vocabulary;
+ }
+
+ // Load each CSV file
+ foreach (var (source, filePath) in csvFileDict)
+ {
+ try
+ {
+ var terms = await LoadCsvFileAsync(filePath);
+ vocabulary[source] = terms;
+ _logger.LogInformation($"Loaded {terms.Count} terms for source '{source}' from {filePath}");
+ }
+ catch (Exception ex)
+ {
+ _logger.LogError(ex, $"Error loading CSV file for source '{source}': {filePath}");
+ }
+ }
+
+ return vocabulary;
+ }
+
+ [SharpCache(60)]
+    public async Task<Dictionary<string, (string DbPath, string CanonicalForm)>> LoadSynonymMappingAsync()
+    {
+        string filename = "";
+        var result = new Dictionary<string, (string DbPath, string CanonicalForm)>();
+ if (string.IsNullOrWhiteSpace(filename))
+ {
+ return result;
+ }
+
+ var searchFolder = Path.Combine(AppContext.BaseDirectory, "data", "plugins", "fuzzySharp");
+ var filePath = Path.Combine(searchFolder, filename);
+
+ if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
+ {
+ return result;
+ }
+
+ try
+ {
+ using var reader = new StreamReader(filePath);
+ using var csv = new CsvReader(reader, CreateCsvConfig());
+
+ await csv.ReadAsync();
+ csv.ReadHeader();
+
+ if (!HasRequiredColumns(csv))
+ {
+ _logger.LogWarning("Synonym mapping file missing required columns: {FilePath}", filePath);
+ return result;
+ }
+
+ while (await csv.ReadAsync())
+ {
+ var term = csv.GetField("term") ?? string.Empty;
+ var dbPath = csv.GetField("dbPath") ?? string.Empty;
+ var canonicalForm = csv.GetField("canonical_form") ?? string.Empty;
+
+ if (term.Length == 0 || dbPath.Length == 0 || canonicalForm.Length == 0)
+ {
+ _logger.LogWarning(
+ "Missing column(s) in CSV at row {Row}: term={Term}, dbPath={DbPath}, canonical_form={CanonicalForm}",
+ csv.Parser.RawRow,
+ term ?? "",
+ dbPath ?? "",
+ canonicalForm ?? "");
+ continue;
+ }
+
+ var key = term.ToLowerInvariant();
+ result[key] = (dbPath, canonicalForm);
+ }
+
+ _logger.LogInformation("Loaded synonym mapping from {FilePath}: {Count} terms", filePath, result.Count);
+ }
+ catch (Exception ex)
+ {
+ _logger.LogError(ex, "Error loading synonym mapping file: {FilePath}", filePath);
+ }
+
+ return result;
+ }
+
+    private async Task<HashSet<string>> LoadCsvFileAsync(string filePath)
+    {
+        var terms = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+
+ if (!File.Exists(filePath))
+ {
+ _logger.LogWarning($"CSV file does not exist: {filePath}");
+ return terms;
+ }
+
+ using var reader = new StreamReader(filePath);
+ using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture)
+ {
+ HasHeaderRecord = false // No header in the CSV files
+ });
+
+ while (await csv.ReadAsync())
+ {
+ // Read the first column (assuming it contains the terms)
+ var term = csv.GetField(0);
+ if (!string.IsNullOrWhiteSpace(term))
+ {
+ terms.Add(term.Trim());
+ }
+ }
+
+ _logger.LogInformation($"Loaded {terms.Count} terms from {Path.GetFileName(filePath)}");
+ return terms;
+ }
+
+    private async Task<Dictionary<string, string>> LoadCsvFilesFromFolderAsync(string folderName)
+    {
+        var csvFileDict = new Dictionary<string, string>();
+ var searchFolder = Path.Combine(AppContext.BaseDirectory, "data", "plugins", "fuzzySharp", folderName);
+ if (!Directory.Exists(searchFolder))
+ {
+ _logger.LogWarning($"Folder does not exist: {searchFolder}");
+ return csvFileDict;
+ }
+
+ var csvFiles = Directory.GetFiles(searchFolder, "*.csv");
+ foreach (var file in csvFiles)
+ {
+ var fileName = Path.GetFileNameWithoutExtension(file);
+ csvFileDict[fileName] = file;
+ }
+
+ _logger.LogInformation($"Loaded {csvFileDict.Count} CSV files from {searchFolder}");
+ return await Task.FromResult(csvFileDict);
+ }
+
+ private static CsvConfiguration CreateCsvConfig()
+ {
+ return new CsvConfiguration(CultureInfo.InvariantCulture)
+ {
+ HasHeaderRecord = true,
+ DetectColumnCountChanges = true,
+ MissingFieldFound = null
+ };
+ }
+
+ private static bool HasRequiredColumns(CsvReader csv)
+ {
+ return csv.HeaderRecord is { Length: > 0 } headers
+ && headers.Contains("term")
+ && headers.Contains("dbPath")
+ && headers.Contains("canonical_form");
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/ExactMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/ExactMatcher.cs
new file mode 100644
index 000000000..38e562eff
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/ExactMatcher.cs
@@ -0,0 +1,23 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp;
+using BotSharp.Plugin.FuzzySharp.Constants;
+
+namespace BotSharp.Plugin.FuzzySharp.Services.Matching;
+
+public class ExactMatcher : ITokenMatcher
+{
+ public int Priority => 2; // Second highest priority
+
+ public MatchResult? TryMatch(MatchContext context)
+ {
+ if (context.Lookup.TryGetValue(context.ContentLow, out var match))
+ {
+ return new MatchResult(
+ CanonicalForm: match.CanonicalForm,
+ Sources: match.Sources,
+ MatchType: MatchReason.ExactMatch,
+ Confidence: 1.0);
+ }
+
+ return null;
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/FuzzyMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/FuzzyMatcher.cs
new file mode 100644
index 000000000..193e28bc6
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/FuzzyMatcher.cs
@@ -0,0 +1,81 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp;
+using System.Text.RegularExpressions;
+using FuzzySharp;
+using FuzzySharp.SimilarityRatio;
+using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive;
+using BotSharp.Plugin.FuzzySharp.Constants;
+
+namespace BotSharp.Plugin.FuzzySharp.Services.Matching;
+
+public class FuzzyMatcher : ITokenMatcher
+{
+ public int Priority => 1; // Lowest priority
+
+ public MatchResult? TryMatch(MatchContext context)
+ {
+ var match = CheckTypoCorrection(context.ContentSpan, context.Lookup, context.Cutoff);
+ if (match == null)
+ {
+ return null;
+ }
+
+ var (canonicalForm, sources, confidence) = match.Value;
+ return new MatchResult(
+ CanonicalForm: canonicalForm,
+ Sources: sources,
+ MatchType: MatchReason.TypoCorrection,
+ Confidence: confidence);
+ }
+
+ ///
+ /// Check typo correction using fuzzy matching
+ ///
+    private (string CanonicalForm, List<string> Sources, double Confidence)? CheckTypoCorrection(
+        string contentSpan,
+        Dictionary<string, (string CanonicalForm, List<string> Sources)> lookup,
+        double cutoff)
+ {
+ // Convert cutoff to 0-100 scale for FuzzySharp
+ var scoreCutoff = (int)(cutoff * 100);
+
+ // Get all candidates from lookup
+ var candidates = lookup.Keys.ToList();
+
+ // Find best match using ExtractOne
+        var scorer = ScorerCache.Get<DefaultRatioScorer>();
+ var result = Process.ExtractOne(
+ contentSpan,
+ candidates,
+ candidate => Normalize(candidate), // Preprocessor function
+ scorer,
+ scoreCutoff // Score cutoff
+ );
+
+ if (result == null)
+ {
+ return null;
+ }
+
+ // Get the canonical form and sources from lookup
+ var match = lookup[result.Value];
+ return (match.CanonicalForm, match.Sources, Math.Round(result.Score / 100.0, 3));
+ }
+
+ ///
+ /// Normalize text for fuzzy matching comparison
+ /// - Replaces all non-word characters (except apostrophes) with spaces
+ /// - Converts to lowercase
+ /// - Collapses multiple spaces into single space
+ /// - Trims leading/trailing whitespace
+ /// Example: "Test-Value (123)" → "test value 123"
+ ///
+ /// Text to normalize
+ /// Normalized text suitable for fuzzy matching
+ private string Normalize(string text)
+ {
+ // Replace non-word characters (except apostrophes) with spaces
+ var normalized = Regex.Replace(text, @"[^\w']+", " ", RegexOptions.IgnoreCase);
+ // Convert to lowercase, collapse multiple spaces, and trim
+ return Regex.Replace(normalized.ToLowerInvariant(), @"\s+", " ").Trim();
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/SynonymMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/SynonymMatcher.cs
new file mode 100644
index 000000000..9f6d8f97d
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/SynonymMatcher.cs
@@ -0,0 +1,23 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp;
+using BotSharp.Plugin.FuzzySharp.Constants;
+
+namespace BotSharp.Plugin.FuzzySharp.Services.Matching;
+
+public class SynonymMatcher : ITokenMatcher
+{
+ public int Priority => 3; // Highest priority
+
+ public MatchResult? TryMatch(MatchContext context)
+ {
+ if (context.SynonymMapping.TryGetValue(context.ContentLow, out var match))
+ {
+ return new MatchResult(
+ CanonicalForm: match.CanonicalForm,
+                Sources: new List<string> { match.DbPath },
+ MatchType: MatchReason.SynonymMatch,
+ Confidence: 1.0);
+ }
+
+ return null;
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/PhraseService.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/PhraseService.cs
new file mode 100644
index 000000000..cd05ca6a6
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/PhraseService.cs
@@ -0,0 +1,199 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp;
+using BotSharp.Plugin.FuzzySharp.FuzzSharp.Arguments;
+using BotSharp.Plugin.FuzzySharp.FuzzSharp.Models;
+using BotSharp.Abstraction.Knowledges;
+using BotSharp.Abstraction.Knowledges.Models;
+using BotSharp.Plugin.FuzzySharp.Utils;
+using Microsoft.Extensions.Logging;
+using System.Diagnostics;
+
+namespace BotSharp.Plugin.FuzzySharp.Services;
+
+public class PhraseService : IPhraseService
+{
+    private readonly ILogger<PhraseService> _logger;
+    private readonly IEnumerable<IPhraseCollection> _phraseLoaderServices;
+    private readonly INgramProcessor _ngramProcessor;
+    private readonly IResultProcessor _resultProcessor;
+
+    public PhraseService(
+        ILogger<PhraseService> logger,
+        IEnumerable<IPhraseCollection> phraseLoaderServices,
+        INgramProcessor ngramProcessor,
+        IResultProcessor resultProcessor)
+ {
+ _logger = logger;
+ _phraseLoaderServices = phraseLoaderServices;
+ _ngramProcessor = ngramProcessor;
+ _resultProcessor = resultProcessor;
+ }
+
+    public Task<List<SearchPhrasesResult>> SearchPhrasesAsync(string term)
+ {
+ var request = BuildTextAnalysisRequest(term);
+ var response = AnalyzeTextAsync(request);
+ return response.ContinueWith(t =>
+ {
+ var results = t.Result.Flagged.Select(f => new SearchPhrasesResult
+ {
+ Token = f.Token,
+ Sources = f.Sources,
+ CanonicalForm = f.CanonicalForm,
+ MatchType = f.MatchType,
+ Confidence = f.Confidence
+ }).ToList();
+ return results;
+ });
+ }
+
+ private TextAnalysisRequest BuildTextAnalysisRequest(string inputText)
+ {
+ return new TextAnalysisRequest
+ {
+ Text = inputText
+ };
+ }
+
+ ///
+ /// Analyze text for typos and entities using domain-specific vocabulary
+ ///
+    private async Task<TextAnalysisResponse> AnalyzeTextAsync(TextAnalysisRequest request)
+ {
+ var stopwatch = Stopwatch.StartNew();
+ try
+ {
+ // Tokenize the text
+ var tokens = TextTokenizer.Tokenize(request.Text);
+
+ // Load vocabulary
+ var vocabulary = await LoadAllVocabularyAsync();
+
+ // Load synonym mapping
+ var synonymMapping = await LoadAllSynonymMappingAsync();
+
+ // Analyze text
+ var flagged = AnalyzeTokens(tokens, vocabulary, synonymMapping, request);
+
+ stopwatch.Stop();
+
+ var response = new TextAnalysisResponse
+ {
+ Original = request.Text,
+ Flagged = flagged,
+ ProcessingTimeMs = Math.Round(stopwatch.Elapsed.TotalMilliseconds, 2)
+ };
+
+ if (request.IncludeTokens)
+ {
+ response.Tokens = tokens;
+ }
+
+ _logger.LogInformation(
+ $"Text analysis completed in {response.ProcessingTimeMs}ms | " +
+ $"Text length: {request.Text.Length} chars | " +
+ $"Flagged items: {flagged.Count}");
+
+ return response;
+ }
+ catch (Exception)
+ {
+ stopwatch.Stop();
+ throw;
+ }
+ }
+
+    public async Task<Dictionary<string, HashSet<string>>> LoadAllVocabularyAsync()
+    {
+        var results = await Task.WhenAll(_phraseLoaderServices.Select(c => c.LoadVocabularyAsync()));
+        var merged = new Dictionary<string, HashSet<string>>();
+
+ foreach (var dict in results)
+ {
+ foreach (var kvp in dict)
+ {
+ if (!merged.TryGetValue(kvp.Key, out var set))
+                merged[kvp.Key] = new HashSet<string>(kvp.Value);
+ else
+ set.UnionWith(kvp.Value);
+ }
+ }
+
+ return merged;
+ }
+
+    public async Task<Dictionary<string, (string DbPath, string CanonicalForm)>> LoadAllSynonymMappingAsync()
+    {
+        var results = await Task.WhenAll(_phraseLoaderServices.Select(c => c.LoadSynonymMappingAsync()));
+        var merged = new Dictionary<string, (string DbPath, string CanonicalForm)>();
+
+ foreach (var dict in results)
+ {
+ foreach (var kvp in dict)
+ merged[kvp.Key] = kvp.Value; // later entries override earlier ones
+ }
+
+ return merged;
+ }
+
+ ///
+ /// Analyze tokens for typos and entities
+ ///
+    private List<FlaggedItem> AnalyzeTokens(
+        List<string> tokens,
+        Dictionary<string, HashSet<string>> vocabulary,
+        Dictionary<string, (string DbPath, string CanonicalForm)> synonymMapping,
+        TextAnalysisRequest request)
+ {
+ // Build lookup table for O(1) exact match lookups (matching Python's build_lookup)
+ var lookup = BuildLookup(vocabulary);
+
+ // Process n-grams and find matches
+ var flagged = _ngramProcessor.ProcessNgrams(
+ tokens,
+ vocabulary,
+ synonymMapping,
+ lookup,
+ request.MaxNgram,
+ request.Cutoff,
+ request.TopK);
+
+ // Process results: deduplicate and sort
+ return _resultProcessor.ProcessResults(flagged);
+ }
+
+ ///
+ /// Build a lookup dictionary mapping lowercase terms to their canonical form and sources.
+ /// This is a performance optimization - instead of iterating through all sources for each lookup,
+ /// we build a flat dictionary once at the start.
+ ///
+ /// Matches Python's build_lookup() function.
+ ///
+    private Dictionary<string, (string CanonicalForm, List<string> Sources)> BuildLookup(
+        Dictionary<string, HashSet<string>> vocabulary)
+    {
+        var lookup = new Dictionary<string, (string CanonicalForm, List<string> Sources)>();
+
+ foreach (var (source, terms) in vocabulary)
+ {
+ foreach (var term in terms)
+ {
+ var key = term.ToLowerInvariant();
+ if (lookup.TryGetValue(key, out var existing))
+ {
+ // Term already exists - add this source to the list if not already there
+ if (!existing.Sources.Contains(source))
+ {
+ existing.Sources.Add(source);
+ }
+ }
+ else
+ {
+ // New term - create entry with single source in list
+                lookup[key] = (term, new List<string> { source });
+ }
+ }
+ }
+
+ return lookup;
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/NgramProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/NgramProcessor.cs
new file mode 100644
index 000000000..86e584067
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/NgramProcessor.cs
@@ -0,0 +1,131 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp;
+using BotSharp.Plugin.FuzzySharp.FuzzSharp.Models;
+
+namespace BotSharp.Plugin.FuzzySharp.Services.Processors;
+
+public class NgramProcessor : INgramProcessor
+{
+    private readonly List<ITokenMatcher> _matchers;
+
+    public NgramProcessor(IEnumerable<ITokenMatcher> matchers)
+ {
+ // Sort matchers by priority (highest first)
+ _matchers = matchers.OrderByDescending(m => m.Priority).ToList();
+ }
+
+    public List<FlaggedItem> ProcessNgrams(
+        List<string> tokens,
+        Dictionary<string, HashSet<string>> vocabulary,
+        Dictionary<string, (string DbPath, string CanonicalForm)> synonymMapping,
+        Dictionary<string, (string CanonicalForm, List<string> Sources)> lookup,
+        int maxNgram,
+        double cutoff,
+        int topK)
+    {
+        var flagged = new List<FlaggedItem>();
+
+ // Process n-grams from largest to smallest
+ for (int n = maxNgram; n >= 1; n--)
+ {
+ for (int i = 0; i <= tokens.Count - n; i++)
+ {
+ var item = ProcessSingleNgram(
+ tokens,
+ i,
+ n,
+ vocabulary,
+ synonymMapping,
+ lookup,
+ cutoff,
+ topK);
+
+ if (item != null)
+ {
+ flagged.Add(item);
+ }
+ }
+ }
+
+ return flagged;
+ }
+
+ ///
+ /// Process a single n-gram at the specified position
+ ///
+    private FlaggedItem? ProcessSingleNgram(
+        List<string> tokens,
+        int startIdx,
+        int n,
+        Dictionary<string, HashSet<string>> vocabulary,
+        Dictionary<string, (string DbPath, string CanonicalForm)> synonymMapping,
+        Dictionary<string, (string CanonicalForm, List<string> Sources)> lookup,
+        double cutoff,
+        int topK)
+ {
+ // Extract content span
+ var (contentSpan, spanTokens, contentIndices) = ExtractContentSpan(tokens, startIdx, n);
+ if (string.IsNullOrWhiteSpace(contentSpan))
+ {
+ return null;
+ }
+
+ var contentLow = contentSpan.ToLowerInvariant();
+
+ // Try matching in priority order using matchers
+ var context = new MatchContext(
+ contentSpan,
+ contentLow,
+ startIdx,
+ n,
+ vocabulary,
+ synonymMapping,
+ lookup,
+ cutoff,
+ topK);
+
+ foreach (var matcher in _matchers)
+ {
+ var matchResult = matcher.TryMatch(context);
+ if (matchResult != null)
+ {
+ return CreateFlaggedItem(matchResult, startIdx, contentSpan, n);
+ }
+ }
+
+ return null;
+ }
+
+ ///
+ /// Create a FlaggedItem from a MatchResult
+ ///
+ private FlaggedItem CreateFlaggedItem(
+ MatchResult matchResult,
+ int startIndex,
+ string contentSpan,
+ int ngramLength)
+ {
+ return new FlaggedItem
+ {
+ Index = startIndex,
+ Token = contentSpan,
+ Sources = matchResult.Sources,
+ MatchType = matchResult.MatchType,
+ CanonicalForm = matchResult.CanonicalForm,
+ Confidence = matchResult.Confidence,
+ NgramLength = ngramLength
+ };
+ }
+
+ ///
+ /// Extract content span
+ ///
+    private (string ContentSpan, List<string> Tokens, List<int> ContentIndices) ExtractContentSpan(
+        List<string> tokens,
+        int startIdx,
+        int n)
+ {
+ var span = tokens.Skip(startIdx).Take(n).ToList();
+ var indices = Enumerable.Range(startIdx, n).ToList();
+ return (string.Join(" ", span), span, indices);
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/ResultProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/ResultProcessor.cs
new file mode 100644
index 000000000..ea402804d
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/ResultProcessor.cs
@@ -0,0 +1,102 @@
+using BotSharp.Plugin.FuzzySharp.FuzzSharp;
+using BotSharp.Plugin.FuzzySharp.FuzzSharp.Models;
+using BotSharp.Plugin.FuzzySharp.Constants;
+
+namespace BotSharp.Plugin.FuzzySharp.Services.Processors;
+
+/// <summary>
+/// Post-processes flagged detections: removes overlapping duplicates with the same
+/// canonical form, then sorts the survivors for presentation.
+/// </summary>
+public class ResultProcessor : IResultProcessor
+{
+ /// <summary>
+ /// Deduplicate overlapping detections, then sort by confidence (descending)
+ /// and match type (alphabetically). Mirrors Python's _sort_and_format_results.
+ /// </summary>
+ /// <param name="flagged">Raw detections, possibly overlapping.</param>
+ /// <returns>Deduplicated detections in presentation order.</returns>
+ public List ProcessResults(List flagged)
+ {
+ // Remove overlapping duplicates
+ var deduped = RemoveOverlappingDuplicates(flagged);
+
+ // Sort by confidence (descending), then match_type (alphabetically)
+ // This matches Python's _sort_and_format_results function
+ return deduped
+ .OrderByDescending(f => f.Confidence)
+ .ThenBy(f => f.MatchType)
+ .ToList();
+ }
+
+ /// <summary>
+ /// Remove overlapping detections with the same canonical form.
+ /// When multiple detections overlap and have the same canonical_form,
+ /// keep only the best one based on:
+ /// 1. Prefer synonym_match over exact_match over typo_correction (matches matcher priority)
+ /// 2. Highest confidence
+ /// 3. Shortest n-gram length
+ /// </summary>
+ /// <param name="flagged">Raw detections to deduplicate.</param>
+ /// <returns>One best detection per overlapping group.</returns>
+ private List RemoveOverlappingDuplicates(List flagged)
+ {
+ var deduped = new List();
+ // Indices already absorbed into an earlier item's overlap group.
+ var skipIndices = new HashSet();
+
+ for (int i = 0; i < flagged.Count; i++)
+ {
+ if (skipIndices.Contains(i))
+ {
+ continue;
+ }
+
+ var item = flagged[i];
+ // Half-open token range [start, start + length) occupied by this detection.
+ var itemRange = (item.Index, item.Index + item.NgramLength);
+
+ // Find all overlapping items with same canonical_form (regardless of match_type)
+ // NOTE(review): overlap is tested against item i's range only, not chained
+ // transitively through group members — confirm this matches the Python behavior.
+ var overlappingGroup = new List { item };
+ for (int j = i + 1; j < flagged.Count; j++)
+ {
+ if (skipIndices.Contains(j))
+ {
+ continue;
+ }
+
+ var other = flagged[j];
+ if (item.CanonicalForm == other.CanonicalForm)
+ {
+ var otherRange = (other.Index, other.Index + other.NgramLength);
+ if (RangesOverlap(itemRange, otherRange))
+ {
+ overlappingGroup.Add(other);
+ skipIndices.Add(j);
+ }
+ }
+ }
+
+ // Keep the best item from the overlapping group
+ // Priority: synonym_match (3) > exact_match (2) > typo_correction (1)
+ // Then highest confidence, then shortest ngram
+ var bestItem = overlappingGroup
+ .OrderByDescending(x => GetMatchTypePriority(x.MatchType))
+ .ThenByDescending(x => x.Confidence)
+ .ThenBy(x => x.NgramLength)
+ .First();
+ deduped.Add(bestItem);
+ }
+
+ return deduped;
+ }
+
+ /// <summary>
+ /// Get priority value for match type (higher is better)
+ /// Matches the priority order in matchers: synonym > exact > fuzzy
+ /// </summary>
+ /// <param name="matchType">One of the MatchReason constants.</param>
+ /// <returns>3 for synonym, 2 for exact, 1 for typo correction, 0 for anything else.</returns>
+ private int GetMatchTypePriority(string matchType)
+ {
+ return matchType switch
+ {
+ MatchReason.SynonymMatch => 3, // Highest priority
+ MatchReason.ExactMatch => 2, // Second priority
+ MatchReason.TypoCorrection => 1, // Lowest priority
+ _ => 0 // Unknown types get lowest priority
+ };
+ }
+
+ /// <summary>
+ /// Check if two token ranges overlap.
+ /// Ranges are half-open [start, end); touching ranges do not overlap.
+ /// </summary>
+ /// <param name="range1">First token range.</param>
+ /// <param name="range2">Second token range.</param>
+ /// <returns>True when the ranges share at least one token position.</returns>
+ private bool RangesOverlap((int start, int end) range1, (int start, int end) range2)
+ {
+ return range1.start < range2.end && range2.start < range1.end;
+ }
+}
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Using.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Using.cs
new file mode 100644
index 000000000..1a0fe1eab
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Using.cs
@@ -0,0 +1,5 @@
+global using System;
+global using System.Collections.Generic;
+global using System.Linq;
+global using System.Text;
+global using System.Threading.Tasks;
diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Utils/TextTokenizer.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Utils/TextTokenizer.cs
new file mode 100644
index 000000000..8853733a2
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Utils/TextTokenizer.cs
@@ -0,0 +1,63 @@
+using BotSharp.Plugin.FuzzySharp.Constants;
+
+namespace BotSharp.Plugin.FuzzySharp.Utils;
+
+/// <summary>
+/// Whitespace-based tokenizer: first pads separator characters with spaces,
+/// then splits on the configured token separators.
+/// </summary>
+public static class TextTokenizer
+{
+ /// <summary>
+ /// Preprocess text: add spaces before and after characters that need to be separated
+ /// This allows subsequent simple whitespace tokenization to correctly separate these characters
+ /// Example: "(IH)" -> " ( IH ) " -> ["(", "IH", ")"]
+ /// </summary>
+ /// <param name="text">Text to preprocess</param>
+ /// <returns>Preprocessed text (input returned unchanged when null/whitespace)</returns>
+ public static string PreprocessText(string text)
+ {
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return text;
+ }
+
+ // Worst case every char is a separator padded by two spaces; 2x is a pragmatic preallocation.
+ var result = new StringBuilder(text.Length * 2);
+
+ foreach (var ch in text)
+ {
+ // If it's a character that needs to be separated, add spaces before and after
+ if (TextConstants.SeparatorChars.Contains(ch))
+ {
+ result.Append(' ');
+ result.Append(ch);
+ result.Append(' ');
+ }
+ else
+ {
+ result.Append(ch);
+ }
+ }
+
+ return result.ToString();
+ }
+
+ /// <summary>
+ /// Simple whitespace tokenization
+ /// Should be called after preprocessing text with PreprocessText
+ /// </summary>
+ /// <param name="text">Text to tokenize</param>
+ /// <returns>List of tokens (empty entries are removed)</returns>
+ public static List SimpleTokenize(string text)
+ {
+ return text.Split(TextConstants.TokenSeparators, StringSplitOptions.RemoveEmptyEntries).ToList();
+ }
+
+ /// <summary>
+ /// Complete tokenization flow: preprocessing + tokenization
+ /// This is the recommended usage
+ /// </summary>
+ /// <param name="text">Text to tokenize</param>
+ /// <returns>List of tokens</returns>
+ public static List Tokenize(string text)
+ {
+ var preprocessed = PreprocessText(text);
+ return SimpleTokenize(preprocessed);
+ }
+}
diff --git a/src/WebStarter/WebStarter.csproj b/src/WebStarter/WebStarter.csproj
index 5a7c6eb7b..082ac578e 100644
--- a/src/WebStarter/WebStarter.csproj
+++ b/src/WebStarter/WebStarter.csproj
@@ -37,6 +37,7 @@
+
diff --git a/src/WebStarter/appsettings.json b/src/WebStarter/appsettings.json
index be70e0d47..5cd9c0d6d 100644
--- a/src/WebStarter/appsettings.json
+++ b/src/WebStarter/appsettings.json
@@ -890,7 +890,8 @@
"BotSharp.Plugin.ExcelHandler",
"BotSharp.Plugin.SqlDriver",
"BotSharp.Plugin.TencentCos",
- "BotSharp.Plugin.PythonInterpreter"
+ "BotSharp.Plugin.PythonInterpreter",
+ "BotSharp.Plugin.FuzzySharp"
]
}
}