Codestin Search App

diff --git a/eng/Versions.props b/eng/Versions.props index 27815de8359..cfb06111922 100644 --- a/eng/Versions.props +++ b/eng/Versions.props @@ -138,6 +138,7 @@ Win-x64 is used here because we have picked an arbitrary runtime identifier to flow the version of the latest NETCore.App runtime. All Runtime.$rid packages should have the same version. --> + 1.0.1 $(MicrosoftNETCoreAppRuntimewinx64Version) 4.8.0 diff --git a/eng/packages/General-LTS.props b/eng/packages/General-LTS.props index 4e24ca6630f..884d874c5e1 100644 --- a/eng/packages/General-LTS.props +++ b/eng/packages/General-LTS.props @@ -23,6 +23,7 @@ + diff --git a/eng/packages/General-net9.props b/eng/packages/General-net9.props index c477508b1b2..341f69458a8 100644 --- a/eng/packages/General-net9.props +++ b/eng/packages/General-net9.props @@ -23,6 +23,7 @@ + diff --git a/eng/packages/General.props b/eng/packages/General.props index 6872259407d..a19e9d2bb91 100644 --- a/eng/packages/General.props +++ b/eng/packages/General.props @@ -1,6 +1,7 @@ + @@ -10,6 +11,7 @@ + diff --git a/eng/packages/TestOnly.props b/eng/packages/TestOnly.props index bd7fe7d01d3..d5dd228c9cb 100644 --- a/eng/packages/TestOnly.props +++ b/eng/packages/TestOnly.props @@ -3,6 +3,7 @@ + @@ -12,7 +13,7 @@ - + diff --git a/eng/pipelines/templates/BuildAndTest.yml b/eng/pipelines/templates/BuildAndTest.yml index e3ec5bcd7dd..e3814264015 100644 --- a/eng/pipelines/templates/BuildAndTest.yml +++ b/eng/pipelines/templates/BuildAndTest.yml @@ -23,6 +23,12 @@ parameters: default: false steps: + - task: NodeTool@0 + displayName: Add NodeJS/npm + inputs: + versionSpec: "20.x" + checkLatest: true + - script: ${{ parameters.buildScript }} -restore /bl:${{ parameters.repoLogPath }}/restore.binlog @@ -45,6 +51,11 @@ steps: $(_OfficialBuildIdArgs) displayName: Build + - ${{ if eq(parameters.isWindows, 'true') }}: + - pwsh: | + 
$(Build.SourcesDirectory)/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/azure-devops-report/build.ps1 -OutputPath $(Build.ArtifactStagingDirectory)\VSIX + displayName: Build Azure DevOps plugin + - ${{ if ne(parameters.skipTests, 'true') }}: - script: $(Build.SourcesDirectory)/.dotnet/dotnet dotnet-coverage collect --settings $(Build.SourcesDirectory)/eng/CodeCoverage.config diff --git a/eng/xunit.runner.json b/eng/xunit.runner.json index 6e96fab28c8..826972feba6 100644 --- a/eng/xunit.runner.json +++ b/eng/xunit.runner.json @@ -1,4 +1,5 @@ { "diagnosticMessages": true, - "longRunningTestSeconds": 300 + "longRunningTestSeconds": 300, + "shadowCopy": false } diff --git a/scripts/ConfigureEvaluationTests.ps1 b/scripts/ConfigureEvaluationTests.ps1 new file mode 100644 index 00000000000..d58cb1352db --- /dev/null +++ b/scripts/ConfigureEvaluationTests.ps1 @@ -0,0 +1,65 @@ +#!/usr/bin/env pwsh + +<# +.SYNOPSIS + Configures local repo for online evaluation tests, which use external resources. + +.DESCRIPTION + This script copies appsettings files from a location on the developer's machine to the test + project directories so that the tests are configured to connect to external resources. The online + configuration files are gitignore'd and not checked in to the repo. + +.PARAMETER Configure + Configure this repo for online evaluation tests, by copying appsettings files from the developer's + machine to this repo. +.PARAMETER Unconfigure + Unconfigure this repo for online evaluation tests, by removing the appsettings files from this repo. +.PARAMETER ConfigRoot + ConfigRoot specifies where to copy the configuration files from. The default is $HOME/.config/dotnet-extensions. 
+#> + +param ( + [switch]$Configure=$False, + [switch]$Unconfigure=$False, + [string]$ConfigRoot=$Null +) + +Write-Host "$PSScriptRoot" + +if ($Configure -and $Unconfigure) { + Write-Error -Message "Cannot specify both -Configure and -Unconfigure" + Exit 1 +} + +if (!(Test-Path $ConfigRoot)) { + $ConfigRoot = "$HOME/.config/dotnet-extensions" +} + +$ProjectRoot = Resolve-Path "$PSScriptRoot/../test/Libraries" +$ReportingConfig = "Microsoft.Extensions.AI.Evaluation.Reporting.Tests/appsettings.local.json" +$IntegrationConfig = "Microsoft.Extensions.AI.Evaluation.Integration.Tests/appsettings.local.json" + +if ($Configure) { + if (!(Test-Path -Path "$ConfigRoot/$ReportingConfig")) { + Write-Host "No configuration found at $ConfigRoot/$ReportingConfig" + Exit 0 + } + if (!(Test-Path -Path "$ConfigRoot/$IntegrationConfig")) { + Write-Host "No configuration found at $ConfigRoot/$IntegrationConfig" + Exit 0 + } + + Copy-Item -Path "$ConfigRoot/$ReportingConfig" -Destination "$ProjectRoot/$ReportingConfig" -Force + Copy-Item -Path "$ConfigRoot/$IntegrationConfig" -Destination "$ProjectRoot/$IntegrationConfig" -Force + + Write-Host "Test configured to use external resources" +} elseif ($Unconfigure) { + Remove-Item -Path "$ProjectRoot/$ReportingConfig" -Force + Remove-Item -Path "$ProjectRoot/$IntegrationConfig" -Force + + Write-Host "Test unconfigured from using external resources" +} else { + Write-Error -Message "Must specify either -Configure or -Unconfigure" + Exit 1 +} + diff --git a/src/Libraries/Directory.Build.props b/src/Libraries/Directory.Build.props index 5dabc150a2c..ba90d284ea6 100644 --- a/src/Libraries/Directory.Build.props +++ b/src/Libraries/Directory.Build.props @@ -11,7 +11,6 @@ true true true - true true diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/CleanCacheCommand.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/CleanCacheCommand.cs new file mode 100644 index 00000000000..2bea0ed0efd --- 
/dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/CleanCacheCommand.cs @@ -0,0 +1,28 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Console.Utilities; +using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; +using Microsoft.Extensions.Logging; + +namespace Microsoft.Extensions.AI.Evaluation.Console.Commands; + +internal sealed class CleanCacheCommand(ILogger logger) +{ + internal async Task InvokeAsync(DirectoryInfo storageRootDir, CancellationToken cancellationToken = default) + { + string storageRootPath = storageRootDir.FullName; + logger.LogInformation("Storage root path: {storageRootPath}", storageRootPath); + logger.LogInformation("Deleting expired cache entries..."); + + var cacheProvider = new DiskBasedResponseCacheProvider(storageRootPath); + + await logger.ExecuteWithCatchAsync( + () => cacheProvider.DeleteExpiredCacheEntriesAsync(cancellationToken)).ConfigureAwait(false); + + return 0; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/CleanResultsCommand.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/CleanResultsCommand.cs new file mode 100644 index 00000000000..9489e5b6e92 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/CleanResultsCommand.cs @@ -0,0 +1,63 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Collections.Generic; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Console.Utilities; +using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; +using Microsoft.Extensions.Logging; + +namespace Microsoft.Extensions.AI.Evaluation.Console.Commands; + +internal sealed class CleanResultsCommand(ILogger logger) +{ + internal async Task InvokeAsync( + DirectoryInfo storageRootDir, + int lastN, + CancellationToken cancellationToken = default) + { + string storageRootPath = storageRootDir.FullName; + logger.LogInformation("Storage root path: {storageRootPath}", storageRootPath); + + var resultStore = new DiskBasedResultStore(storageRootPath); + + await logger.ExecuteWithCatchAsync( + async ValueTask () => + { + if (lastN is 0) + { + logger.LogInformation("Deleting all results..."); + + await resultStore.DeleteResultsAsync(cancellationToken: cancellationToken).ConfigureAwait(false); + } + else + { + logger.LogInformation("Deleting all results except the {lastN} most recent ones...", lastN); + + HashSet toPreserve = []; + + await foreach (string executionName in + resultStore.GetLatestExecutionNamesAsync(lastN, cancellationToken).ConfigureAwait(false)) + { + _ = toPreserve.Add(executionName); + } + + await foreach (string executionName in + resultStore.GetLatestExecutionNamesAsync( + cancellationToken: cancellationToken).ConfigureAwait(false)) + { + if (!toPreserve.Contains(executionName)) + { + await resultStore.DeleteResultsAsync( + executionName, + cancellationToken: cancellationToken).ConfigureAwait(false); + } + } + } + }).ConfigureAwait(false); + + return 0; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/ReportCommand.Format.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/ReportCommand.Format.cs new file mode 100644 index 00000000000..2644e2bfd4c --- /dev/null +++ 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/ReportCommand.Format.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace Microsoft.Extensions.AI.Evaluation.Console.Commands; + +internal partial class ReportCommand +{ + internal enum Format + { + html, + json + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/ReportCommand.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/ReportCommand.cs new file mode 100644 index 00000000000..ec7f659edd0 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Commands/ReportCommand.cs @@ -0,0 +1,63 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Reporting; +using Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Html; +using Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Json; +using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; +using Microsoft.Extensions.Logging; + +namespace Microsoft.Extensions.AI.Evaluation.Console.Commands; + +internal sealed partial class ReportCommand(ILogger logger) +{ + internal async Task InvokeAsync( + DirectoryInfo storageRootDir, + FileInfo outputFile, + int lastN, + Format format, + CancellationToken cancellationToken = default) + { + string storageRootPath = storageRootDir.FullName; + logger.LogInformation("Storage root path: {storageRootPath}", storageRootPath); + + var results = new List(); + var resultStore = new DiskBasedResultStore(storageRootPath); + + await foreach (string executionName in + resultStore.GetLatestExecutionNamesAsync(lastN, cancellationToken).ConfigureAwait(false)) + { + await foreach 
(ScenarioRunResult result in + resultStore.ReadResultsAsync( + executionName, + cancellationToken: cancellationToken).ConfigureAwait(false)) + { + results.Add(result); + } + } + + string outputFilePath = outputFile.FullName; + string? outputPath = Path.GetDirectoryName(outputFilePath); + if (outputPath is not null && !Directory.Exists(outputPath)) + { + _ = Directory.CreateDirectory(outputPath); + } + + IEvaluationReportWriter reportWriter = format switch + { + Format.html => new HtmlReportWriter(outputFilePath), + Format.json => new JsonReportWriter(outputFilePath), + _ => throw new NotSupportedException(), + }; + + await reportWriter.WriteReportAsync(results, cancellationToken).ConfigureAwait(false); + logger.LogInformation("Report: {outputFilePath} [{format}]", outputFilePath, format); + + return 0; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Microsoft.Extensions.AI.Evaluation.Console.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Microsoft.Extensions.AI.Evaluation.Console.csproj new file mode 100644 index 00000000000..209d13226ac --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Microsoft.Extensions.AI.Evaluation.Console.csproj @@ -0,0 +1,66 @@ + + + + A dotnet tool for managing the evaluation data and generating reports. + Exe + $(NetCoreTargetFrameworks) + Microsoft.Extensions.AI.Evaluation.Console + + $(NoWarn);EA0000 + true + aieval + + + + AIEval + preview + true + false + 88 + 0 + + + + + + + + + + + + + + + + + + + + $(IntermediateOutputPath)Constants.g.cs + +// +// This file is auto-generated by MSBuild. 
+// + +namespace Microsoft.Extensions.AI.Evaluation.Console%3B + +internal static class Constants +{ + public const string Version = "$(Version)"%3B +} + + + + + + + + + + + + diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs new file mode 100644 index 00000000000..5158dbe4262 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs @@ -0,0 +1,109 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if DEBUG +using System.CommandLine.Parsing; +using System.Diagnostics; +#endif +using System.CommandLine; +using System.IO; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Console.Commands; +using Microsoft.Extensions.Logging; + +namespace Microsoft.Extensions.AI.Evaluation.Console; + +internal sealed class Program +{ + private const string Name = "Microsoft.Extensions.AI.Evaluation.Console"; + private const string Banner = $"{Name} [{Constants.Version}]"; + +#pragma warning disable EA0014 // Async methods should support cancellation. 
+ private static async Task Main(string[] args) +#pragma warning restore EA0014 + { + using ILoggerFactory factory = LoggerFactory.Create(builder => builder.AddConsole()); + ILogger logger = factory.CreateLogger(Name); + logger.LogInformation("{banner}", Banner); + + var rootCmd = new RootCommand(Banner); + +#if DEBUG + var debugOpt = new Option(["--debug"], "Debug on startup") { IsHidden = true }; + rootCmd.AddGlobalOption(debugOpt); +#endif + + var reportCmd = new Command("report", "Generate a report "); + + var pathOpt = + new Option( + ["-p", "--path"], + "Root path under which the cache and results are stored") + { + IsRequired = true + }; + + reportCmd.AddOption(pathOpt); + + var outputOpt = new Option(["-o", "--output"], "Output filename/path") { IsRequired = true }; + reportCmd.AddOption(outputOpt); + + var lastNOpt = new Option(["-n"], () => 1, "Number of most recent executions to include in the report."); + reportCmd.AddOption(lastNOpt); + + var formatOpt = + new Option( + "--format", + () => ReportCommand.Format.html, + "Specify the format for the generated report."); + + reportCmd.AddOption(formatOpt); + + reportCmd.SetHandler( + (path, output, lastN, format) => new ReportCommand(logger).InvokeAsync(path, output, lastN, format), + pathOpt, + outputOpt, + lastNOpt, + formatOpt); + + rootCmd.Add(reportCmd); + + // TASK: Support more granular filters such as the specific scenario / iteration / execution whose results must + // be cleaned up. 
+ var cleanResults = new Command("cleanResults", "Delete results"); + cleanResults.AddOption(pathOpt); + + var lastNOpt2 = new Option(["-n"], () => 0, "Number of most recent executions to preserve."); + cleanResults.AddOption(lastNOpt2); + + cleanResults.SetHandler( + (path, lastN) => new CleanResultsCommand(logger).InvokeAsync(path, lastN), + pathOpt, + lastNOpt2); + + rootCmd.Add(cleanResults); + + var cleanCache = new Command("cleanCache", "Delete expired cache entries"); + cleanCache.AddOption(pathOpt); + + cleanCache.SetHandler( + path => new CleanCacheCommand(logger).InvokeAsync(path), + pathOpt); + + rootCmd.Add(cleanCache); + + // TASK: Support some mechanism to fail a build (i.e. return a failure exit code) based on one or more user + // specified criteria (e.g., if x% of metrics were deemed 'poor'). Ideally this mechanism would be flexible / + // extensible enough to allow users to configure multiple different kinds of failure criteria. + +#if DEBUG + ParseResult parseResult = rootCmd.Parse(args); + if (parseResult.HasOption(debugOpt)) + { + Debugger.Launch(); + } +#endif + + return await rootCmd.InvokeAsync(args).ConfigureAwait(false); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md new file mode 100644 index 00000000000..09345b5e58c --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md @@ -0,0 +1,46 @@ +# The Microsoft.Extensions.AI.Evaluation libraries + +`Microsoft.Extensions.AI.Evaluation` is a set of .NET libraries defined in the following NuGet packages that have been designed to work together to support building processes for evaluating the quality of AI software. + +* [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. 
+* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Equivalence and Groundedness. +* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. +* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. +* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. + +## Install the packages + +From the command-line: + +```console +dotnet add package Microsoft.Extensions.AI.Evaluation +dotnet add package Microsoft.Extensions.AI.Evaluation.Quality +dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +``` + +Or directly in the C# project file: + +```xml + + + + + +``` + +You can optionally add the `Microsoft.Extensions.AI.Evaluation.Reporting.Azure` package in either of these places if you need Azure Storage support. 
+ +## Install the command line tool + +```console +dotnet tool install Microsoft.Extensions.AI.Evaluation.Console --create-manifest-if-needed +``` + +## Usage Examples + +For a comprehensive tour of all the functionality, concepts and APIs available in the `Microsoft.Extensions.AI.Evaluation` libraries, check out the [API Usage Examples](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/) available in the [dotnet/ai-samples](https://github.com/dotnet/ai-samples) repo. These examples are structured as a collection of unit tests. Each unit test showcases a specific concept or API, and builds on the concepts and APIs showcased in previous unit tests. + + +## Feedback & Contributing + +We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions). diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Utilities/ExceptionUtilities.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Utilities/ExceptionUtilities.cs new file mode 100644 index 00000000000..b96c564b1ce --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Utilities/ExceptionUtilities.cs @@ -0,0 +1,52 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; + +namespace Microsoft.Extensions.AI.Evaluation.Console.Utilities; + +internal static class ExceptionUtilities +{ + internal static bool IsCancellation(this Exception exception) => + exception switch + { + OperationCanceledException => true, + AggregateException aggregateException => aggregateException.ContainsOnlyCancellations(), + _ => false + }; + + private static bool ContainsOnlyCancellations(this AggregateException exception) + { + var toCheck = new Stack(); + toCheck.Push(exception); + + var seen = new HashSet(); + bool containsAtLeastOneCancellation = false; + + while (toCheck.TryPop(out Exception? current)) + { + // To avoid infinite loops, ignore exceptions that were already seen. + if (seen.Add(current)) + { + if (current is AggregateException aggregateException) + { + foreach (var innerException in aggregateException.InnerExceptions) + { + toCheck.Push(innerException); + } + } + else if (current is OperationCanceledException) + { + containsAtLeastOneCancellation = true; + } + else + { + return false; + } + } + } + + return containsAtLeastOneCancellation; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Utilities/LoggerExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Utilities/LoggerExtensions.cs new file mode 100644 index 00000000000..e502516f83b --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Utilities/LoggerExtensions.cs @@ -0,0 +1,274 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable EA0014 +// EA0014: Async methods should support cancellation. +// We disable this warning because the helpers in this file are wrapper functions that don't themselves perform any +// cancellable operations. 
+ +using System; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; + +namespace Microsoft.Extensions.AI.Evaluation.Console.Utilities; + +internal static class LoggerExtensions +{ + internal static bool LogException(this ILogger logger, Exception exception) + { + logger.LogError(exception, message: null); + return true; + } + + internal static void ExecuteWithCatch( + this ILogger logger, + Action action, + bool swallowUnhandledExceptions = false) + { + try + { + action(); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + } + + internal static void ExecuteWithCatch( + this ILogger logger, + Action action, + TArgument argument, + bool swallowUnhandledExceptions = false) + { + try + { + action(argument); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + } + + internal static TResult? ExecuteWithCatch( + this ILogger logger, + Func action, + TResult? defaultValue = default, + bool swallowUnhandledExceptions = false) + { + try + { + return action(); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + + return defaultValue; + } + + internal static TResult? ExecuteWithCatch( + this ILogger logger, + Func action, + TArgument argument, + TResult? 
defaultValue = default, + bool swallowUnhandledExceptions = false) + { + try + { + return action(argument); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + + return defaultValue; + } + + internal static async Task ExecuteWithCatchAsync( + this ILogger logger, + Func action, + bool swallowUnhandledExceptions = false) + { + try + { + await action().ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + } + + internal static async ValueTask ExecuteWithCatchAsync( + this ILogger logger, + Func action, + bool swallowUnhandledExceptions = false) + { + try + { + await action().ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + } + + internal static async Task ExecuteWithCatchAsync( + this ILogger logger, + Func action, + TArgument argument, + bool swallowUnhandledExceptions = false) + { + try + { + await action(argument).ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. 
+ } + } + + internal static async ValueTask ExecuteWithCatchAsync( + this ILogger logger, + Func action, + TArgument argument, + bool swallowUnhandledExceptions = false) + { + try + { + await action(argument).ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + } + + internal static async Task ExecuteWithCatchAsync( + this ILogger logger, + Func> action, + TResult? defaultValue = default, + bool swallowUnhandledExceptions = false) + { + try + { + return await action().ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + + return defaultValue; + } + + internal static async ValueTask ExecuteWithCatchAsync( + this ILogger logger, + Func> action, + TResult? defaultValue = default, + bool swallowUnhandledExceptions = false) + { + try + { + return await action().ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + + return defaultValue; + } + + internal static async Task ExecuteWithCatchAsync( + this ILogger logger, + Func> action, + TArgument argument, + TResult? 
defaultValue = default, + bool swallowUnhandledExceptions = false) + { + try + { + return await action(argument).ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + + return defaultValue; + } + + internal static async ValueTask ExecuteWithCatchAsync( + this ILogger logger, + Func> action, + TArgument argument, + TResult? defaultValue = default, + bool swallowUnhandledExceptions = false) + { + try + { + return await action(argument).ConfigureAwait(false); + } + catch (Exception ex) when (swallowUnhandledExceptions && ex.IsCancellation()) + { + // Do nothing. + } + catch (Exception ex) when (!ex.IsCancellation() && logger.LogException(ex) && swallowUnhandledExceptions) + { + // Do nothing. The exception is logged in the when clause above. + } + + return defaultValue; + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs new file mode 100644 index 00000000000..ea6bee6b18b --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs @@ -0,0 +1,393 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +///

+/// An <see langword="abstract"/> base class that can be used to implement an AI-based <see cref="IEvaluator"/>. +///

+public abstract class ChatConversationEvaluator : IEvaluator +{ + /// + public abstract IReadOnlyCollection EvaluationMetricNames { get; } + + ///

+ /// Gets the <see cref="AI.ChatOptions"/> that this <see cref="IEvaluator"/> uses when performing evaluations. +///

+ protected virtual ChatOptions? ChatOptions => null; + + ///

+ /// Gets a value indicating whether this <see cref="IEvaluator"/> ignores the conversation history and + /// considers only the request and response being evaluated as part of the evaluation it performs. +///

+ /// <value> + /// <see langword="true"/> if this <see cref="IEvaluator"/> ignores the conversation history as part of + /// the evaluation it performs; <see langword="false"/> otherwise. + /// </value> + protected abstract bool IgnoresHistory { get; } + + ///

+ /// Gets the system prompt that this <see cref="IEvaluator"/> uses when performing evaluations. +///

+ protected virtual string? SystemPrompt => null; + + /// + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatMessage modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(modelResponse, nameof(modelResponse)); + _ = Throw.IfNull(chatConfiguration, nameof(chatConfiguration)); + + EvaluationResult result = InitializeResult(); + + if (string.IsNullOrWhiteSpace(modelResponse.Text)) + { + result.AddDiagnosticToAllMetrics( + EvaluationDiagnostic.Error( + "Evaluation failed because the model response supplied for evaluation was null or empty.")); + + return result; + } + + (ChatMessage? userRequest, List history) = GetUserRequestAndHistory(messages); + + int inputTokenLimit = 0; + int ignoredMessagesCount = 0; + + if (chatConfiguration.TokenCounter is not null) + { + IEvaluationTokenCounter tokenCounter = chatConfiguration.TokenCounter; + inputTokenLimit = tokenCounter.InputTokenLimit; + int tokenBudget = inputTokenLimit; + + void OnTokenBudgetExceeded() + { + EvaluationDiagnostic tokenBudgetExceeded = + EvaluationDiagnostic.Error( + $"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded."); + + result.AddDiagnosticToAllMetrics(tokenBudgetExceeded); + } + + if (!string.IsNullOrWhiteSpace(SystemPrompt)) + { + tokenBudget -= tokenCounter.CountTokens(SystemPrompt!); + if (tokenBudget < 0) + { + OnTokenBudgetExceeded(); + return result; + } + } + + string baseEvaluationPrompt = + await RenderEvaluationPromptAsync( + userRequest, + modelResponse, + includedHistory: [], + additionalContext, + cancellationToken).ConfigureAwait(false); + + tokenBudget -= tokenCounter.CountTokens(baseEvaluationPrompt); + if (tokenBudget < 0) + { + OnTokenBudgetExceeded(); + return result; + } + + if (history.Count > 0 && !IgnoresHistory) + { + if (history.Count == 1) + { + bool canRender = + await CanRenderAsync( + 
history[0], + ref tokenBudget, + chatConfiguration, + cancellationToken).ConfigureAwait(false); + + if (!canRender) + { + ignoredMessagesCount = 1; + history = []; + } + } + else + { + int totalMessagesCount = history.Count; + int includedMessagesCount = 0; + + history.Reverse(); + + foreach (ChatMessage message in history) + { + cancellationToken.ThrowIfCancellationRequested(); + + bool canRender = + await CanRenderAsync( + message, + ref tokenBudget, + chatConfiguration, + cancellationToken).ConfigureAwait(false); + + if (!canRender) + { + ignoredMessagesCount = totalMessagesCount - includedMessagesCount; + history.RemoveRange(index: includedMessagesCount, count: ignoredMessagesCount); + break; + } + + includedMessagesCount++; + } + + history.Reverse(); + } + } + } + + var evaluationMessages = new List(); + if (!string.IsNullOrWhiteSpace(SystemPrompt)) + { + evaluationMessages.Add(new ChatMessage(ChatRole.System, SystemPrompt!)); + } + + string evaluationPrompt = + await RenderEvaluationPromptAsync( + userRequest, + modelResponse, + includedHistory: history, + additionalContext, + cancellationToken).ConfigureAwait(false); + + evaluationMessages.Add(new ChatMessage(ChatRole.User, evaluationPrompt)); + + ChatResponse evaluationResponse = + await chatConfiguration.ChatClient.GetResponseAsync( + evaluationMessages, + ChatOptions, + cancellationToken: cancellationToken).ConfigureAwait(false); + + string? 
evaluationResponseContent = evaluationResponse.Message.Text; + + if (string.IsNullOrWhiteSpace(evaluationResponseContent)) + { + result.AddDiagnosticToAllMetrics( + EvaluationDiagnostic.Error( + "Evaluation failed because the model failed to produce a valid evaluation response.")); + } + else + { + await ParseEvaluationResponseAsync( + evaluationResponseContent!, + result, + chatConfiguration, + cancellationToken).ConfigureAwait(false); + } + + if (inputTokenLimit > 0 && ignoredMessagesCount > 0) + { +#pragma warning disable S103 // Lines should not be too long + result.AddDiagnosticToAllMetrics( + EvaluationDiagnostic.Warning( + $"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens.")); +#pragma warning restore S103 + } + + return result; + } + + ///

+    /// Determines if there is sufficient <paramref name="tokenBudget"/> remaining to render the
+    /// supplied <paramref name="message"/> as part of the evaluation prompt that this <see cref="IEvaluator"/> uses.
+    /// </summary>
+    /// <param name="message">
+    /// A message that is part of the conversation history for the response being evaluated and that is to be rendered
+    /// as part of the evaluation prompt.
+    /// </param>
+    /// <param name="tokenBudget">
+    /// The remaining number of tokens available for rendering additional content as part of the evaluation prompt.
+    /// </param>
+    /// <param name="chatConfiguration">
+    /// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> and the
+    /// <see cref="IEvaluationTokenCounter"/> that this <see cref="IEvaluator"/> uses to perform the evaluation.
+    /// </param>
+    /// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
+    /// <returns>
+    /// <see langword="true"/> if there is sufficient <paramref name="tokenBudget"/> remaining to render the supplied
+    /// <paramref name="message"/> as part of the evaluation prompt; <see langword="false"/> otherwise.
+    /// </returns>
+    protected virtual ValueTask<bool> CanRenderAsync(
+        ChatMessage message,
+        ref int tokenBudget,
+        ChatConfiguration chatConfiguration,
+        CancellationToken cancellationToken)
+    {
+        _ = Throw.IfNull(message, nameof(message));
+        _ = Throw.IfNull(chatConfiguration, nameof(chatConfiguration));
+
+        IEvaluationTokenCounter? tokenCounter = chatConfiguration.TokenCounter;
+        if (tokenCounter is null)
+        {
+            return new ValueTask<bool>(true); // no counter configured, so no limit is enforced
+        }
+
+        string? author = message.AuthorName;
+        string role = message.Role.Value;
+        string content = message.Text ?? string.Empty;
+
+        int tokenCount =
+            string.IsNullOrWhiteSpace(author)
+                ? tokenCounter.CountTokens("[") +
+                    tokenCounter.CountTokens(role) +
+                    tokenCounter.CountTokens("] ") +
+                    tokenCounter.CountTokens(content) +
+                    tokenCounter.CountTokens("\n")
+                : tokenCounter.CountTokens("[") +
+                    tokenCounter.CountTokens(author!) +
+                    tokenCounter.CountTokens(" (") +
+                    tokenCounter.CountTokens(role) +
+                    tokenCounter.CountTokens(")] ") +
+                    tokenCounter.CountTokens(content) +
+                    tokenCounter.CountTokens("\n");
+
+        if (tokenCount > tokenBudget)
+        {
+            return new ValueTask<bool>(false); // budget is left untouched when the message does not fit
+        }
+        else
+        {
+            tokenBudget -= tokenCount;
+            return new ValueTask<bool>(true);
+        }
+    }
+
+    /// <summary>

+    /// Renders the supplied <paramref name="message"/> to a string that can be included as part of the evaluation
+    /// prompt that this <see cref="IEvaluator"/> uses.
+    /// </summary>
+    /// <param name="message">
+    /// A message that is part of the conversation history for the response being evaluated and that is to be rendered
+    /// as part of the evaluation prompt.
+    /// </param>
+    /// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
+    /// <returns>
+    /// A string representation of the supplied <paramref name="message"/> that can be included as part of the
+    /// evaluation prompt.
+    /// </returns>
+    protected virtual ValueTask<string> RenderAsync(ChatMessage message, CancellationToken cancellationToken)
+    {
+        _ = Throw.IfNull(message, nameof(message));
+
+        string? author = message.AuthorName;
+        string role = message.Role.Value;
+        string? content = message.Text;
+
+        return string.IsNullOrWhiteSpace(author)
+            ? new ValueTask<string>($"[{role}] {content}\n")
+            : new ValueTask<string>($"[{author} ({role})] {content}\n");
+    }
+
+    /// <summary>

+    /// Renders the information present in the supplied parameters into a prompt that this <see cref="IEvaluator"/>
+    /// uses to perform the evaluation.
+    /// </summary>
+    /// <param name="userRequest">
+    /// The request that produced the <paramref name="modelResponse"/> that is to be evaluated.
+    /// </param>
+    /// <param name="modelResponse">The response that is to be evaluated.</param>
+    /// <param name="includedHistory">
+    /// The conversation history (excluding the <paramref name="userRequest"/> and <paramref name="modelResponse"/>)
+    /// that is to be included as part of the evaluation prompt.
+    /// </param>
+    /// <param name="additionalContext">
+    /// Additional contextual information (beyond that which is available in the <paramref name="userRequest"/> and
+    /// <paramref name="includedHistory"/>) that this <see cref="IEvaluator"/> may need to accurately evaluate the
+    /// supplied <paramref name="modelResponse"/>.
+    /// </param>
+    /// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
+    /// <returns>The evaluation prompt.</returns>
+    protected abstract ValueTask<string> RenderEvaluationPromptAsync(
+        ChatMessage? userRequest,
+        ChatMessage modelResponse,
+        IEnumerable<ChatMessage>? includedHistory,
+        IEnumerable<EvaluationContext>? additionalContext,
+        CancellationToken cancellationToken);
+
+    /// <summary>

+    /// Returns an <see cref="EvaluationResult"/> that includes default values for all the
+    /// <see cref="EvaluationMetric"/>s supported by this <see cref="IEvaluator"/>.
+    /// </summary>
+    /// <remarks>
+    /// The <see cref="EvaluationMetric.Name"/>s of the <see cref="EvaluationMetric"/>s contained in the returned
+    /// <see cref="EvaluationResult"/> should match <see cref="IEvaluator.EvaluationMetricNames"/>.
+    /// </remarks>
+    /// <returns>
+    /// An <see cref="EvaluationResult"/> that includes default values for all the
+    /// <see cref="EvaluationMetric"/>s supported by this <see cref="IEvaluator"/>.
+    /// </returns>
+    protected abstract EvaluationResult InitializeResult();
+
+    /// <summary>

+    /// Parses the evaluation result present in <paramref name="modelResponseForEvaluationPrompt"/> into the
+    /// <see cref="EvaluationMetric"/>s present in the supplied <paramref name="result"/>.
+    /// </summary>
+    /// <param name="modelResponseForEvaluationPrompt">
+    /// An AI-generated response that contains the result of the current evaluation.
+    /// </param>
+    /// <param name="result">
+    /// An <see cref="EvaluationResult"/> that includes a collection of <see cref="EvaluationMetric"/>s that are
+    /// supported by this <see cref="IEvaluator"/>.
+    /// </param>
+    /// <param name="chatConfiguration">
+    /// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> and the
+    /// <see cref="IEvaluationTokenCounter"/> that this <see cref="IEvaluator"/> uses to perform the evaluation.
+    /// </param>
+    /// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
+    /// <returns>A <see cref="ValueTask"/> that represents the asynchronous operation.</returns>
+    protected abstract ValueTask ParseEvaluationResponseAsync(
+        string modelResponseForEvaluationPrompt,
+        EvaluationResult result,
+        ChatConfiguration chatConfiguration,
+        CancellationToken cancellationToken);
+
+    private (ChatMessage? userRequest, List<ChatMessage> history) GetUserRequestAndHistory(
+        IEnumerable<ChatMessage> messages)
+    {
+        ChatMessage? userRequest = null;
+        List<ChatMessage> history;
+
+        if (IgnoresHistory)
+        {
+            userRequest =
+                messages.LastOrDefault() is ChatMessage lastMessage && lastMessage.Role == ChatRole.User
+                    ? lastMessage
+                    : null;
+
+            history = [];
+        }
+        else
+        {
+            history = [.. messages];
+            int lastMessageIndex = history.Count - 1;
+
+            if (lastMessageIndex >= 0 &&
+                history[lastMessageIndex] is ChatMessage lastMessage &&
+                lastMessage.Role == ChatRole.User)
+            {
+                userRequest = lastMessage;
+                history.RemoveAt(lastMessageIndex); // the request is rendered separately, not as history
+            }
+        }
+
+        return (userRequest, history);
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CoherenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CoherenceEvaluator.cs
new file mode 100644
index 00000000000..8c31feb2dde
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CoherenceEvaluator.cs
@@ -0,0 +1,93 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality;
+
+/// <summary>

+/// An <see cref="IEvaluator"/> that evaluates the 'Coherence' of a response produced by an AI model.
+/// </summary>
+/// <remarks>
+/// <see cref="CoherenceEvaluator"/> returns a <see cref="NumericMetric"/> that contains a score for 'Coherence'. The
+/// score is a number between 1 and 5, with 1 indicating a poor score, and 5 indicating an excellent score.
+/// </remarks>
+public sealed class CoherenceEvaluator : SingleNumericMetricEvaluator
+{
+    /// <summary>
+    /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
+    /// <see cref="CoherenceEvaluator"/>.
+    /// </summary>
+    public static string CoherenceMetricName => "Coherence";
+
+    /// <inheritdoc/>
+    protected override string MetricName => CoherenceMetricName;
+
+    /// <inheritdoc/>
+    protected override bool IgnoresHistory => true;
+
+    /// <inheritdoc/>
+    protected override async ValueTask<string> RenderEvaluationPromptAsync(
+        ChatMessage? userRequest,
+        ChatMessage modelResponse,
+        IEnumerable<ChatMessage>? includedHistory,
+        IEnumerable<EvaluationContext>? additionalContext,
+        CancellationToken cancellationToken)
+    {
+        string renderedModelResponse = await RenderAsync(modelResponse, cancellationToken).ConfigureAwait(false);
+
+        string renderedUserRequest =
+            userRequest is not null
+                ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
+                : string.Empty;
+
+        string prompt =
+            $$"""
+            Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a
+            whole. Consider the overall quality of the answer when evaluating coherence.
+
+            Given the question and answer, score the coherence of the answer between one to five stars using the
+            following rating scale:
+            One star: the answer completely lacks coherence
+            Two stars: the answer mostly lacks coherence
+            Three stars: the answer is partially coherent
+            Four stars: the answer is mostly coherent
+            Five stars: the answer has perfect coherency
+
+            The rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3
+            or 4 or 5.
+
+            question: What is your favorite indoor activity and why do you enjoy it?
+            answer: I like pizza. The sun is shining.
+            stars: 1
+
+            question: Can you describe your favorite movie without giving away any spoilers?
+            answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the
+            villain.
+            stars: 2
+
+            question: What are some benefits of regular exercise?
+            answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green.
+            stars: 3
+
+            question: How do you cope with stress in your daily life?
+            answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a
+            part of life, but we can manage it through some activities.
+            stars: 4
+
+            question: What can you tell me about climate change and its effects on the environment?
+            answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the
+            melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather
+            events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
+            stars: 5
+
+            question: {{renderedUserRequest}}
+            answer: {{renderedModelResponse}}
+            stars:
+            """;
+
+        return prompt;
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs
new file mode 100644
index 00000000000..ed482688e0c
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs
@@ -0,0 +1,133 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality;
+
+/// <summary>

+/// An <see cref="IEvaluator"/> that evaluates the 'Equivalence' of a response produced by an AI model.
+/// </summary>
+/// <remarks>
+/// The <see cref="EquivalenceEvaluator"/> measures the degree to which the response being evaluated is similar to the
+/// response supplied via <see cref="EquivalenceEvaluatorContext.GroundTruth"/>. It returns a
+/// <see cref="NumericMetric"/> that contains a score for the 'Equivalence'. The score is a number between 1 and 5,
+/// with 1 indicating a poor score, and 5 indicating an excellent score.
+/// </remarks>
+public sealed class EquivalenceEvaluator : SingleNumericMetricEvaluator
+{
+    /// <summary>
+    /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
+    /// <see cref="EquivalenceEvaluator"/>.
+    /// </summary>
+    public static string EquivalenceMetricName => "Equivalence";
+
+    /// <inheritdoc/>
+    protected override string MetricName => EquivalenceMetricName;
+
+    /// <inheritdoc/>
+    protected override bool IgnoresHistory => true;
+
+    /// <inheritdoc/>
+    protected override async ValueTask<string> RenderEvaluationPromptAsync(
+        ChatMessage? userRequest,
+        ChatMessage modelResponse,
+        IEnumerable<ChatMessage>? includedHistory,
+        IEnumerable<EvaluationContext>? additionalContext,
+        CancellationToken cancellationToken)
+    {
+        string renderedModelResponse = await RenderAsync(modelResponse, cancellationToken).ConfigureAwait(false);
+
+        string renderedUserRequest =
+            userRequest is not null
+                ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
+                : string.Empty;
+
+        string groundTruth;
+
+        if (additionalContext?.OfType<EquivalenceEvaluatorContext>().FirstOrDefault()
+                is EquivalenceEvaluatorContext context)
+        {
+            groundTruth = context.GroundTruth;
+        }
+        else
+        {
+            throw new InvalidOperationException(
+                $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection.");
+        }
+
+        string prompt =
+            $$"""
+            Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If
+            the information and content in the predicted answer is similar or equivalent to the correct answer, then
+            the value of the Equivalence metric should be high, else it should be low.
+
+            Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using
+            the following rating scale:
+            One star: the predicted answer is not at all similar to the correct answer
+            Two stars: the predicted answer is mostly not similar to the correct answer
+            Three stars: the predicted answer is somewhat similar to the correct answer
+            Four stars: the predicted answer is mostly similar to the correct answer
+            Five stars: the predicted answer is completely similar to the correct answer
+
+            The rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3
+            or 4 or 5.
+
+            The examples below show the Equivalence score for a question, a correct answer, and a predicted answer.
+
+            question: What is the role of ribosomes?
+            correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the
+            genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins.
+            predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar
+            molecules.
+            stars: 1
+
+            question: Why did the Titanic sink?
+            correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact
+            caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat
+            shortage, and lack of timely rescue efforts contributed to the tragic loss of life.
+            predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the
+            ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of
+            lifeboats and insufficient rescue attempts.
+            stars: 2
+
+            question: What causes seasons on Earth?
+            correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the
+            Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts
+            of sunlight, resulting in changes in temperature and weather patterns.
+            predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun.
+            The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads
+            to temperature fluctuations and alternating weather conditions.
+            stars: 3
+
+            question: How does photosynthesis work?
+            correct answer: Photosynthesis is a process by which green plants and some other organisms convert light
+            energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon
+            dioxide and water are converted into glucose and oxygen through a series of reactions.
+            predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain
+            microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide
+            and water into sugar and oxygen through multiple reactions.
+            stars: 4
+
+            question: What are the health benefits of regular exercise?
+            correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and
+            reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving
+            overall mood.
+            predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing
+            muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by
+            alleviating stress and augmenting general mood.
+            stars: 5
+
+            question: {{renderedUserRequest}}
+            correct answer: {{groundTruth}}
+            predicted answer: {{renderedModelResponse}}
+            stars:
+            """;
+
+        return prompt;
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluatorContext.cs
new file mode 100644
index 00000000000..7da9518ebbd
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluatorContext.cs
@@ -0,0 +1,31 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#pragma warning disable S3604
+// S3604: Member initializer values should not be redundant.
+// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
+// constructor syntax.
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality;
+
+/// <summary>

+/// Contextual information required to evaluate the 'Equivalence' of a response.
+/// </summary>
+/// <param name="groundTruth">
+/// The ground truth response against which the response that is being evaluated is compared.
+/// </param>
+/// <remarks>
+/// The <see cref="EquivalenceEvaluator"/> measures the degree to which the response being evaluated is similar to the
+/// response supplied via <paramref name="groundTruth"/>.
+/// </remarks>
+public sealed class EquivalenceEvaluatorContext(string groundTruth) : EvaluationContext
+{
+    /// <summary>
+    /// Gets the ground truth response against which the response that is being evaluated is compared.
+    /// </summary>
+    /// <remarks>
+    /// The <see cref="EquivalenceEvaluator"/> measures the degree to which the response being evaluated is similar to
+    /// the response supplied via <see cref="GroundTruth"/>.
+    /// </remarks>
+    public string GroundTruth { get; } = groundTruth;
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs
new file mode 100644
index 00000000000..35ec42837bc
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EvaluationMetricExtensions.cs
@@ -0,0 +1,31 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality;
+
+internal static class EvaluationMetricExtensions
+{
+    internal static EvaluationMetricInterpretation InterpretScore(this NumericMetric metric)
+    {
+        EvaluationRating rating = metric.Value switch
+        {
+            null => EvaluationRating.Inconclusive, // no score was produced
+            > 5.0 => EvaluationRating.Inconclusive, // above the 1-5 scale
+            > 4.0 and <= 5.0 => EvaluationRating.Exceptional,
+            > 3.0 and <= 4.0 => EvaluationRating.Good,
+            > 2.0 and <= 3.0 => EvaluationRating.Average,
+            > 1.0 and <= 2.0 => EvaluationRating.Poor,
+            > 0.0 and <= 1.0 => EvaluationRating.Unacceptable,
+            <= 0.0 => EvaluationRating.Inconclusive, // below the 1-5 scale
+            _ => EvaluationRating.Inconclusive, // reached by double.NaN, which matches no relational pattern
+        };
+
+        const double MinimumPassingScore = 4.0;
+        return metric.Value is double value && value < MinimumPassingScore
+            ? new EvaluationMetricInterpretation(
+                rating,
+                failed: true,
+                reason: $"{metric.Name} is less than {MinimumPassingScore}.")
+            : new EvaluationMetricInterpretation(rating);
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/FluencyEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/FluencyEvaluator.cs
new file mode 100644
index 00000000000..8c11cf0f0c0
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/FluencyEvaluator.cs
@@ -0,0 +1,92 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality;
+
+/// <summary>

+/// An <see cref="IEvaluator"/> that evaluates the 'Fluency' of a response produced by an AI model.
+/// </summary>
+/// <remarks>
+/// <see cref="FluencyEvaluator"/> returns a <see cref="NumericMetric"/> that contains a score for 'Fluency'. The score
+/// is a number between 1 and 5, with 1 indicating a poor score, and 5 indicating an excellent score.
+/// </remarks>
+public sealed class FluencyEvaluator : SingleNumericMetricEvaluator
+{
+    /// <summary>
+    /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
+    /// <see cref="FluencyEvaluator"/>.
+    /// </summary>
+    public static string FluencyMetricName => "Fluency";
+
+    /// <inheritdoc/>
+    protected override string MetricName => FluencyMetricName;
+
+    /// <inheritdoc/>
+    protected override bool IgnoresHistory => true;
+
+    /// <inheritdoc/>
+    protected override async ValueTask<string> RenderEvaluationPromptAsync(
+        ChatMessage? userRequest,
+        ChatMessage modelResponse,
+        IEnumerable<ChatMessage>? includedHistory,
+        IEnumerable<EvaluationContext>? additionalContext,
+        CancellationToken cancellationToken)
+    {
+        string renderedModelResponse = await RenderAsync(modelResponse, cancellationToken).ConfigureAwait(false);
+
+        string renderedUserRequest =
+            userRequest is not null
+                ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
+                : string.Empty;
+
+        string prompt =
+            $$"""
+            Fluency measures the quality of individual sentences in the answer, and whether they are well-written and
+            grammatically correct. Consider the quality of individual sentences when evaluating fluency.
+
+            Given the question and answer, score the fluency of the answer between one to five stars using the
+            following rating scale:
+            One star: the answer completely lacks fluency
+            Two stars: the answer mostly lacks fluency
+            Three stars: the answer is partially fluent
+            Four stars: the answer is mostly fluent
+            Five stars: the answer has perfect fluency
+
+            The rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3
+            or 4 or 5.
+
+            question: What did you have for breakfast today?
+            answer: Breakfast today, me eating cereal and orange juice very good.
+            stars: 1
+
+            question: How do you feel when you travel alone?
+            answer: Alone travel, nervous, but excited also. I feel adventure and like its time.
+            stars: 2
+
+            question: When was the last time you went on a family vacation?
+            answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun.
+            stars: 3
+
+            question: What is your favorite thing about your job?
+            answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly
+            learning from their experiences and stories.
+            stars: 4
+
+            question: Can you describe your morning routine?
+            answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that,
+            I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal
+            and fruits, before leaving the house around 7:30 am.
+            stars: 5
+
+            question: {{renderedUserRequest}}
+            answer: {{renderedModelResponse}}
+            stars:
+            """;
+
+        return prompt;
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs
new file mode 100644
index 00000000000..ddb3d522a44
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs
@@ -0,0 +1,146 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality;
+
+/// <summary>

+/// An <see cref="IEvaluator"/> that evaluates the 'Groundedness' of a response produced by an AI model.
+/// </summary>
+/// <remarks>
+/// The <see cref="GroundednessEvaluator"/> measures the degree to which the response being evaluated is grounded in
+/// the information present in the supplied <see cref="GroundednessEvaluatorContext.GroundingContext"/>. It returns a
+/// <see cref="NumericMetric"/> that contains a score for the 'Groundedness'. The score is a number between 1 and 5,
+/// with 1 indicating a poor score, and 5 indicating an excellent score.
+/// </remarks>
+public sealed class GroundednessEvaluator : SingleNumericMetricEvaluator
+{
+    /// <summary>
+    /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
+    /// <see cref="GroundednessEvaluator"/>.
+    /// </summary>
+    public static string GroundednessMetricName => "Groundedness";
+
+    /// <inheritdoc/>
+    protected override string MetricName => GroundednessMetricName;
+
+    /// <inheritdoc/>
+    protected override bool IgnoresHistory => false;
+
+    /// <inheritdoc/>
+    protected override async ValueTask<string> RenderEvaluationPromptAsync(
+        ChatMessage? userRequest,
+        ChatMessage modelResponse,
+        IEnumerable<ChatMessage>? includedHistory,
+        IEnumerable<EvaluationContext>? additionalContext,
+        CancellationToken cancellationToken)
+    {
+        string renderedModelResponse = await RenderAsync(modelResponse, cancellationToken).ConfigureAwait(false);
+
+        string renderedUserRequest =
+            userRequest is not null
+                ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false)
+                : string.Empty;
+
+        var builder = new StringBuilder();
+
+        if (additionalContext?.OfType<GroundednessEvaluatorContext>().FirstOrDefault()
+                is GroundednessEvaluatorContext context)
+        {
+            _ = builder.Append(context.GroundingContext);
+            _ = builder.AppendLine();
+            _ = builder.AppendLine();
+        }
+
+        if (includedHistory is not null)
+        {
+            foreach (ChatMessage message in includedHistory)
+            {
+                _ = builder.Append(await RenderAsync(message, cancellationToken).ConfigureAwait(false));
+            }
+        }
+
+        string renderedContext = builder.ToString();
+
+        string prompt =
+            $$"""
+            You will be presented with a QUESTION, and an ANSWER to the QUESTION along with some CONTEXT (which may
+            include some conversation history). Groundedness of the ANSWER is measured by how well it logically follows
+            from the information supplied via the CONTEXT and / or QUESTION.
+
+            Score the groundedness of the ANSWER between one to five stars using the following rating scale:
+            One star: the ANSWER is not at all grounded and is logically false based on the supplied info.
+            Two stars: most parts of the ANSWER are not grounded and do not follow logically from the supplied info.
+            Three stars: some parts of the ANSWER are grounded in the supplied info, other parts are not.
+            Four stars: most parts of the ANSWER are grounded and follow logically from the supplied info.
+            Five stars: the ANSWER is perfectly grounded and follows logically from the supplied info.
+
+            If it is not possible to determine whether the ANSWER is logically true or false based on the supplied
+            info, score the ANSWER as one star.
+
+            Read the supplied QUESTION, ANSWER and CONTEXT thoroughly and select the correct rating based on the above
+            criteria. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. (Note that the ANSWER is
+            generated by a computer system and can contain certain symbols. This should not be a negative factor in the
+            evaluation.)
+
+            The rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3
+            or 4 or 5.
+
+            Independent Examples:
+            ## Example Task #1 Input:
+            -----
+            CONTEXT: Some are reported as not having been wanted at all.
+            -----
+            QUESTION:
+            -----
+            ANSWER: All are reported as being completely and fully wanted.
+            -----
+            ## Example Task #1 Output:
+            1
+
+            ## Example Task #2 Input:
+            -----
+            CONTEXT: Ten new television shows appeared during the month of September. Five of the shows were sitcoms,
+            three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows
+            were still on the air. Five of the shows that remained were sitcoms.
+            -----
+            QUESTION: Were there any hourlong shows amongst the shows that were cancelled?,
+            -----
+            ANSWER: At least one of the shows that were cancelled was an hourlong drama.
+            -----
+            ## Example Task #2 Output:
+            5
+
+            ## Example Task #3 Input:
+            -----
+            CONTEXT: In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language
+            is neither French nor English.
+            -----
+            QUESTION: What does the term allophone mean?
+            -----
+            ANSWER: In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language
+            is not French.
+            -----
+            ## Example Task #3 Output:
+            5
+
+            ## Actual Task Input:
+            -----
+            CONTEXT: {{renderedContext}}
+            -----
+            QUESTION: {{renderedUserRequest}}
+            -----
+            ANSWER: {{renderedModelResponse}}
+            -----
+
+            ## Actual Task Output:
+            """;
+
+        return prompt;
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluatorContext.cs
new file mode 100644
index 00000000000..7223640f8d4
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluatorContext.cs
@@ -0,0 +1,31 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#pragma warning disable S3604
+// S3604: Member initializer values should not be redundant.
+// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
+// constructor syntax.
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality;
+
+/// <summary>

+/// Contextual information required to evaluate the 'Groundedness' of a response.
+/// </summary>
+/// <param name="groundingContext">
+/// Contextual information against which the 'Groundedness' of a response is evaluated.
+/// </param>
+/// <remarks>
+/// The <see cref="GroundednessEvaluator"/> measures the degree to which the response being evaluated is grounded in
+/// the information present in the supplied <paramref name="groundingContext"/>.
+/// </remarks>
+public sealed class GroundednessEvaluatorContext(string groundingContext) : EvaluationContext
+{
+    /// <summary>
+    /// Gets the contextual information against which the 'Groundedness' of a response is evaluated.
+    /// </summary>
+    /// <remarks>
+    /// The <see cref="GroundednessEvaluator"/> measures the degree to which the response being evaluated is grounded
+    /// in the information present in the supplied <see cref="GroundingContext"/>.
+    /// </remarks>
+    public string GroundingContext { get; } = groundingContext;
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/Microsoft.Extensions.AI.Evaluation.Quality.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/Microsoft.Extensions.AI.Evaluation.Quality.csproj new file mode 100644 index 00000000000..9ca34e28d5d --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/Microsoft.Extensions.AI.Evaluation.Quality.csproj @@ -0,0 +1,26 @@ + + + + A library containing a set of evaluators for evaluating the quality (coherence, relevance, truth, completeness, groundedness, fluency, equivalence etc.) of responses received from an LLM. + $(TargetFrameworks);netstandard2.0 + Microsoft.Extensions.AI.Evaluation.Quality + + + + AIEval + preview + true + false + 88 + 0 + + + + + + + + + + +
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md
new file mode 100644
index 00000000000..09345b5e58c
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md
@@ -0,0 +1,46 @@
+# The Microsoft.Extensions.AI.Evaluation libraries
+
+`Microsoft.Extensions.AI.Evaluation` is a set of .NET libraries defined in the following NuGet packages that have been designed to work together to support building processes for evaluating the quality of AI software.
+
+* [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation.
+* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Equivalence and Groundedness. +* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. +* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. +* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. + +## Install the packages + +From the command-line: + +```console +dotnet add package Microsoft.Extensions.AI.Evaluation +dotnet add package Microsoft.Extensions.AI.Evaluation.Quality +dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +``` + +Or directly in the C# project file: + +```xml + + + + + +``` + +You can optionally add the `Microsoft.Extensions.AI.Evaluation.Reporting.Azure` package in either of these places if you need Azure Storage support. 
+ +## Install the command line tool + +```console +dotnet tool install Microsoft.Extensions.AI.Evaluation.Console --create-manifest-if-needed +``` + +## Usage Examples + +For a comprehensive tour of all the functionality, concepts and APIs available in the `Microsoft.Extensions.AI.Evaluation` libraries, check out the [API Usage Examples](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/) available in the [dotnet/ai-samples](https://github.com/dotnet/ai-samples) repo. These examples are structured as a collection of unit tests. Each unit test showcases a specific concept or API, and builds on the concepts and APIs showcased in previous unit tests. + + +## Feedback & Contributing + +We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions). diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs new file mode 100644 index 00000000000..91d89d65531 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Prompts.cs @@ -0,0 +1,224 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +public partial class RelevanceTruthAndCompletenessEvaluator +{ + private static class Prompts + { + internal static string BuildEvaluationPrompt(string userQuery, string modelResponse, string history) + { +#pragma warning disable S103 // Lines should not be too long + return + $$""" + Read the History, User Query, and Model Response below and produce your response as a single JSON object. + Do not include any other text in your response besides the JSON object. + + The JSON object should have the following format. 
However, do not include any markdown tags in your + response. Your response should start with an open curly brace and end with a closing curly brace for the + JSON. + ``` + { + "relevance": 1, + "truth": 1, + "completeness": 1 + } + ``` + + ----- + + History: {{history}} + + ----- + + User Query: {{userQuery}} + + ----- + + Model Response: {{modelResponse}} + + ----- + + That's the History, User Query, and Model Response you will rate. Now, in 3 Steps, you will evaluate the Model Response on 3 criteria. + + ----- + + Step 1: Rate the relevance of the response. + + Regardless of truth of statements, how much of the response is directly related to the request? + + 1 = None of the response is at all related + 2 = A little of the response is directly related, or the response is a little bit related + 3 = Some of the response is related, or the response is somewhat related + 4 = Most of the response is related, or the response is mostly related + 5 = 100% of the response is 100% related to the request + + Record your response as the value of the "relevance" property in the JSON output. + + ----- + + Step 2: Rate the truth of the response. + + Read the History, Query, and Model Response again. + + Regardless of relevance, how true are the verifiable statements in the response? + + 1 = The entire response is totally false + 2 = A little of the response is true, or the response is a little bit true + 3 = Some of the response is true, or the response is somewhat true + 4 = Most of the response is true, or the response is mostly true + 5 = 100% of the response is 100% true + + Record your response as the value of the "truth" property in the JSON output. + + ----- + + Step 3: Rate the completeness of the response. + + Read the History, Query, and Model Response again. + + Regardless of whether the statements made in the response are true, how many of the points necessary to address the request, does the response contain? 
+ + 1 = The response omits all points that are necessary to address the request. + 2 = The response includes a little of the points that are necessary to address the request. + 3 = The response includes some of the points that are necessary to address the request. + 4 = The response includes most of the points that are necessary to address the request. + 5 = The response includes all points that are necessary to address the request. For explain tasks, nothing is left unexplained. For improve tasks, I looked for all potential improvements, and none were left out. For fix tasks, the response purports to get the user all the way to a fixed state (regardless of whether it actually works). For "do task" responses, it does everything requested. + + Record your response as the value of the "completeness" property in the JSON output. + + ----- + """; +#pragma warning restore S103 + } + + internal static string BuildEvaluationPromptWithReasoning( + string userQuery, + string modelResponse, + string history) + { +#pragma warning disable S103 // Lines should not be too long + return + $$""" + Read the History, User Query, and Model Response below and produce your response as a single JSON object. + Do not include any other text in your response besides the JSON object. Make sure the response is a valid + JSON object. + + The JSON object should have the following format. However, do not include any markdown tags in your + response. Your response should start with an open curly brace and end with a closing curly brace for the + JSON. 
+ ``` + { + "relevance": 1, + "relevanceReasoning": "The reason for the relevance score", + "relevanceReasons": ["relevance_reason_distant_topic"], + "truth": 1, + "truthReasoning": "The reason for the truth score", + "truthReasons": ["truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent"], + "completeness": 1, + "completenessReasoning": "The reason for the completeness score", + "completenessReasons": ["completeness_reason_no_solution", "completeness_reason_genericsolution_missingcode"] + } + ``` + + ----- + + History: {{history}} + + ----- + + User Query: {{userQuery}} + + ----- + + Model Response: {{modelResponse}} + + ----- + + That's the History, User Query, and Model Response you will rate. Now, in 3 Steps, you will evaluate the Model Response on 3 criteria. + + ----- + + Step 1: Rate the relevance of the response. + + Regardless of truth of statements, how much of the response is directly related to the request? + + 1 = None of the response is at all related + 2 = A little of the response is directly related, or the response is a little bit related + 3 = Some of the response is related, or the response is somewhat related + 4 = Most of the response is related, or the response is mostly related + 5 = 100% of the response is 100% related to the request + + Step 1a. Record your response as the value of the "relevance" property in the JSON output. + + Step 1b. Now, record your reasoning for that score as the value of the "relevanceReasoning" property. If the score is 5, record the value of "relevanceReasoning" as "100% of the response is 100% related to the request". If the score is not 5, write your Reasoning as fewer than 100 words and prioritizing the most important contributing reasons to the score. + + Step 1c. Now, record whether your relevance reasoning includes each of these categories, as an array of strings for the value of the "relevanceReasons" property.
The possible values for this array are "relevance_reason_distant_topic", "relevance_reason_scope_too_small", "relevance_reason_scope_too_large", "relevance_reason_wrong_intent_served", or "relevance_reason_other". If the score is 5, the array should be empty. + - relevance_reason_distant_topic. In a taxonomy of programming-related topics, the response is a non-zero distance from the user request. Example: User: "How can I implement a binary search in Python?" Model Response: "HTML (HyperText Markup Language) is the standard markup language for web pages." + - relevance_reason_scope_too_small. Scope or level of abstraction is smaller than user intent. Example: User: "Can you explain object-oriented programming?" Model Response: "In object-oriented programming, a class is a blueprint for creating objects." + - relevance_reason_scope_too_large. Scope or level of abstraction is larger than user intent. + Example: User: "Why am I getting a null pointer exception here?" Model Response: "A Null Pointer Exception in Java occurs when you try to use a reference that points to no location in memory (null) as though it were referencing an object." + - relevance_reason_wrong_intent_served. Alignment with inferred or explicit intent, or in absence of that, general standards. Example: User: "convert a JSON object to a Python dictionary" Model Response: "JSON is a lightweight data-interchange format that is easy for humans to read and write and easy for machines to parse and generate." (The user intends to write code and the model is serving an intent to explain a concept) + - relevance_reason_other. Any other reasoning. + + ----- + + Step 2: Rate the truth of the response. + + Read the History, Query, and Model Response again. + + Regardless of relevance, how true are the verifiable statements in the response? 
+ + 1 = The entire response is totally false + 2 = A little of the response is true, or the response is a little bit true + 3 = Some of the response is true, or the response is somewhat true + 4 = Most of the response is true, or the response is mostly true + 5 = 100% of the response is 100% true + + Step 2a. Record your response as the value of the "truth" property in the JSON output. + + Step 2b. Now, record your reasoning for that score as the value of the "truthReasoning" property. If the score is 5, record the value of "truthReasoning" as "100% of the response is 100% true". If the score is not 5, write your Reasoning as fewer than 100 words and prioritizing the most important contributing reasons to the score. + + Step 2c. Now, record whether your truth reasoning includes each of these categories, as an array of strings for the value of the "truthReasons" property. The possible values for this array are "truth_reason_incorrect_information", "truth_reason_outdated_information", "truth_reason_misleading_incorrectforintent", or "truth_reason_other". If the score is 5, the array should be empty. + - truth_reason_incorrect_information. The response contains information that is factually incorrect. Example: User: "What is the time complexity of quicksort?" Model Response: "Quicksort has a time complexity of O(n)." + - truth_reason_outdated_information. The response contains information that was once true but is no longer true. Example: User: "How do I install Python 2?" Model Response: "You can install Python 2 using the command sudo apt-get install python." + - truth_reason_misleading_incorrectforintent. The response is true but irrelevant to the user's intent, causing results that are incorrect for the user's context. Example: User: "How do I sort a list in Python?" Model Response: "You can use the sorted() function to sort a list in Python." (sorted() returns a new sorted list, leaving the original list unchanged.
If the user's intent was to sort the original list, they should use list.sort().) + - truth_reason_other. any other reasoning. + + ----- + + Step 3: Rate the completeness of the response. + + Read the History, Query, and Model Response again. + + Regardless of whether the statements made in the response are true, how many of the points necessary to address the request, does the response contain? + + 1 = The response omits all points that are necessary to address the request. + 2 = The response includes a little of the points that are necessary to address the request. + 3 = The response includes some of the points that are necessary to address the request. + 4 = The response includes most of the points that are necessary to address the request. + 5 = The response includes all points that are necessary to address the request. For explain tasks, nothing is left unexplained. For improve tasks, I looked for all potential improvements, and none were left out. For fix tasks, the response purports to get the user all the way to a fixed state (regardless of whether it actually works). For "do task" responses, it does everything requested. + + Step 3a. Record your response as the value of the "completeness" property in the JSON output. + + Step 3b. Now, record your reasoning for that score as the value of the "completenessReasoning" property. If the score is 5, record the value of "completenessReasoning" as "The response includes all points that are necessary to address the request". If the score is not 5, write your Reasoning as fewer than 100 words and prioritizing the most important contributing reasons to the score. + + Step 3c. Now, record whether your completeness reasoning includes each of these categories, as an array of strings for the value of the "completenessReasons" property. 
The possible values for this array are "completeness_reason_no_solution", "completeness_reason_lacks_information_about_solution", "completeness_reason_genericsolution_missingcode", "completeness_reason_generic_code", "completeness_reason_failed_to_change_code", "completeness_reason_incomplete_list", "completeness_reason_incomplete_code", "completeness_reason_lazy_unopinionated", "completeness_reason_missing_warnings", or "completeness_reason_other". If the score is 5, the array should be empty. + - completeness_reason_no_solution. The model response does not achieve or offer a solution to the user intent. Example 1: User: "How can I implement a binary search in Python?" Model Response: "Binary search is a search algorithm." Example 2: User: "How can I implement a binary search in Python?" Model Response: "500 error" + - completeness_reason_lacks_information_about_solution. The model response does not include enough information about its solution, such as why its solution is reasonable, or how it addresses the user intent. Example: User: "How can I reverse a string in Python?" Model Response: "Hello, World!"[::-1]" + - completeness_reason_genericsolution_missingcode. The user intends to generate code or get help writing code. The model response includes a response that solves the problem generically, but does not include code. Example: User: "How can I implement a binary search in Python?" Model Response: "You can implement a binary search by dividing the search space in half each time you fail to find the target value." + - completeness_reason_generic_code. The user intends to generate code or get help writing code that uses specific functions, names, or other components in their current code. The model response includes generic code, and does not modify or use components from the user's current code. Example: User: "How do I use my foo function?" Model Response: "Here's how you can use a function in Python: function_name()."
+ - completeness_reason_failed_to_change_code. The user intends to generate code or get help writing code, but the model response returns code that the user already has. + - completeness_reason_incomplete_list. Serving the user intent requires several natural language components, such as a description of some concept, or a list of system capabilities, reasons to use a particular approach, or problems with code, but the model response addresses fewer than all of the required components or misses parts of components. Example: User: "What are the steps to implement a binary search in Python?" Model Response: "The first step in implementing a binary search is to sort the array." + - completeness_reason_incomplete_code. Serving the user intent requires several code components, such as library imports, object creations and manipulations, and the model offers code, but the code offers fewer than all of the required components. Example: User: "How can I read a CSV file in Python?" Model response: "You can import the pandas library: `import pandas`." + - completeness_reason_lazy_unopinionated. The model claims not to know how, or not be certain enough, to address the user intent and does not offer the user any recourse (e.g., asking the user to be more specific, or offering potential subtopics for ambiguous user requests). Example: User: "compile error" Model response: "I can't help with that, I need more information." (The response doesn't offer any typical troubleshooting ideas based on the user's code, context, or general programming knowledge.) + - completeness_reason_missing_warnings. The response has potential pitfalls or dangers, but does not warn the user about them. Example: User: "How do I delete a file in Python?" Model Response: "You can use os.remove()." (This response should warn the user that this operation is irreversible and should be done with caution.) + - completeness_reason_other. Any other reasoning. 
+ + ----- + """; +#pragma warning restore S103 + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs new file mode 100644 index 00000000000..8ff913fefe7 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.Rating.cs @@ -0,0 +1,72 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Text.Json; +using System.Text.Json.Serialization; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +public partial class RelevanceTruthAndCompletenessEvaluator +{ + internal sealed class Rating + { + public static Rating Inconclusive { get; } = new Rating(relevance: -1, truth: -1, completeness: -1); + + public int Relevance { get; } + public string? RelevanceReasoning { get; } + public string[] RelevanceReasons { get; } = []; + + public int Truth { get; } + public string? TruthReasoning { get; } + public string[] TruthReasons { get; } = []; + + public int Completeness { get; } + public string? CompletenessReasoning { get; } + public string[] CompletenessReasons { get; } = []; + + public string? Error { get; } + + private const int MinValue = 1; + private const int MaxValue = 5; + +#pragma warning disable S1067 // Expressions should not be too complex. + public bool IsInconclusive => + Error is not null || + Relevance < MinValue || Relevance > MaxValue || + Truth < MinValue || Truth > MaxValue || + Completeness < MinValue || Completeness > MaxValue; +#pragma warning restore S1067 + + public Rating(int relevance, int truth, int completeness, string? 
error = null) + { + (Relevance, Truth, Completeness, Error) = (relevance, truth, completeness, error); + } + + [JsonConstructor] +#pragma warning disable S107 // Methods should not have too many parameters. + public Rating( + int relevance, string? relevanceReasoning, string[] relevanceReasons, + int truth, string? truthReasoning, string[] truthReasons, + int completeness, string? completenessReasoning, string[] completenessReasons, + string? error = null) +#pragma warning restore S107 + { + (Relevance, RelevanceReasoning, RelevanceReasons, + Truth, TruthReasoning, TruthReasons, + Completeness, CompletenessReasoning, CompletenessReasons, + Error) = + (relevance, relevanceReasoning, relevanceReasons ?? [], + truth, truthReasoning, truthReasons ?? [], + completeness, completenessReasoning, completenessReasons ?? [], + error); + } + + public static Rating FromJson(string jsonResponse) + { + ReadOnlySpan trimmed = JsonOutputFixer.TrimMarkdownDelimiters(jsonResponse); + return JsonSerializer.Deserialize(trimmed, SerializerContext.Default.Rating)!; + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs new file mode 100644 index 00000000000..211213d4873 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.SerializerContext.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Text.Json.Serialization; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +public partial class RelevanceTruthAndCompletenessEvaluator +{ + [JsonSourceGenerationOptions( + WriteIndented = true, + AllowTrailingCommas = true, + PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)] + [JsonSerializable(typeof(Rating))] + internal sealed partial class SerializerContext : JsonSerializerContext; +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs new file mode 100644 index 00000000000..54c08f87d58 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -0,0 +1,181 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable S3604 +// S3604: Member initializer values should not be redundant. +// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary +// constructor syntax. + +using System.Collections.Generic; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Quality.Utilities; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +///

+/// An that evaluates the 'Relevance', 'Truth' and 'Completeness' of a response produced by an +/// AI model. +///

+/// <remarks> +/// <see cref="RelevanceTruthAndCompletenessEvaluator"/> returns three <see cref="NumericMetric"/>s that contain scores +/// for 'Relevance', 'Truth' and 'Completeness' respectively. Each score is a number between 1 and 5, with 1 indicating +/// a poor score, and 5 indicating an excellent score. +/// </remarks> +/// <param name="options">Options for <see cref="RelevanceTruthAndCompletenessEvaluator"/>.</param> +public sealed partial class RelevanceTruthAndCompletenessEvaluator( + RelevanceTruthAndCompletenessEvaluatorOptions? options = null) : ChatConversationEvaluator +{ + ///

+ /// Gets the of the returned by + /// for 'Relevance'. + ///

+ public static string RelevanceMetricName => "Relevance"; + + ///

+ /// Gets the of the returned by + /// for 'Truth'. + ///

+ public static string TruthMetricName => "Truth"; + + ///

+ /// Gets the of the returned by + /// for 'Completeness'. + ///

+ public static string CompletenessMetricName => "Completeness"; + + /// + public override IReadOnlyCollection EvaluationMetricNames { get; } = + [RelevanceMetricName, TruthMetricName, CompletenessMetricName]; + + /// + protected override ChatOptions? ChatOptions { get; } = + new ChatOptions + { + Temperature = 0.0f, + ResponseFormat = ChatResponseFormat.Json + }; + + private readonly RelevanceTruthAndCompletenessEvaluatorOptions _options = + options ?? RelevanceTruthAndCompletenessEvaluatorOptions.Default; + + /// + protected override bool IgnoresHistory => false; + + /// + protected override EvaluationResult InitializeResult() + { + var relevance = new NumericMetric(RelevanceMetricName); + var truth = new NumericMetric(TruthMetricName); + var completeness = new NumericMetric(CompletenessMetricName); + return new EvaluationResult(relevance, truth, completeness); + } + + /// + protected override async ValueTask RenderEvaluationPromptAsync( + ChatMessage? userRequest, + ChatMessage modelResponse, + IEnumerable? includedHistory, + IEnumerable? additionalContext, + CancellationToken cancellationToken) + { + string renderedModelResponse = await RenderAsync(modelResponse, cancellationToken).ConfigureAwait(false); + + string renderedUserRequest = + userRequest is not null + ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false) + : string.Empty; + + var builder = new StringBuilder(); + if (includedHistory is not null) + { + foreach (ChatMessage message in includedHistory) + { + _ = builder.Append(await RenderAsync(message, cancellationToken).ConfigureAwait(false)); + } + } + + string renderedHistory = builder.ToString(); + + string prompt = + _options.IncludeReasoning + ? 
Prompts.BuildEvaluationPromptWithReasoning( + renderedUserRequest, + renderedModelResponse, + renderedHistory) + : Prompts.BuildEvaluationPrompt( + renderedUserRequest, + renderedModelResponse, + renderedHistory); + + return prompt; + } + + /// + protected override async ValueTask ParseEvaluationResponseAsync( + string modelResponseForEvaluationPrompt, + EvaluationResult result, + ChatConfiguration chatConfiguration, + CancellationToken cancellationToken) + { + modelResponseForEvaluationPrompt = modelResponseForEvaluationPrompt.Trim(); + + try + { + Rating rating = Rating.FromJson(modelResponseForEvaluationPrompt); + UpdateResult(rating); + } + catch (JsonException) + { + try + { + string? repairedJson = + await JsonOutputFixer.RepairJsonAsync( + chatConfiguration, + modelResponseForEvaluationPrompt, + cancellationToken).ConfigureAwait(false); + + Rating rating = repairedJson is null ? Rating.Inconclusive : Rating.FromJson(repairedJson); + UpdateResult(rating); + } + catch (JsonException ex) + { + result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(ex.ToString())); + } + } + + void UpdateResult(Rating rating) + { + NumericMetric relevance = result.Get(RelevanceMetricName); + relevance.Value = rating.Relevance; + relevance.Interpretation = relevance.InterpretScore(); + if (!string.IsNullOrWhiteSpace(rating.RelevanceReasoning)) + { + relevance.AddDiagnostic(EvaluationDiagnostic.Informational(rating.RelevanceReasoning!)); + } + + NumericMetric truth = result.Get(TruthMetricName); + truth.Value = rating.Truth; + truth.Interpretation = truth.InterpretScore(); + if (!string.IsNullOrWhiteSpace(rating.TruthReasoning)) + { + truth.AddDiagnostic(EvaluationDiagnostic.Informational(rating.TruthReasoning!)); + } + + NumericMetric completeness = result.Get(CompletenessMetricName); + completeness.Value = rating.Completeness; + completeness.Interpretation = completeness.InterpretScore(); + if (!string.IsNullOrWhiteSpace(rating.CompletenessReasoning)) + { + 
completeness.AddDiagnostic(EvaluationDiagnostic.Informational(rating.CompletenessReasoning!)); + } + + if (!string.IsNullOrWhiteSpace(rating.Error)) + { + result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!)); + } + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs new file mode 100644 index 00000000000..9271b2cc4af --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluatorOptions.cs @@ -0,0 +1,41 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable S3604 +// S3604: Member initializer values should not be redundant. +// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary +// constructor syntax. + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +///

+/// Options for . +///

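+/// <remarks> +/// If <see cref="IncludeReasoning"/> is set to <see langword="true"/>, this instructs the +/// <see cref="RelevanceTruthAndCompletenessEvaluator"/> to include <see cref="EvaluationDiagnostic"/>s (with +/// <see cref="EvaluationDiagnostic.Severity"/> set to <see cref="EvaluationDiagnosticSeverity.Informational"/>) as +/// part of the returned <see cref="NumericMetric"/>s for 'Relevance', 'Truth' and 'Completeness' that explain the +/// reasoning behind the corresponding scores. By default, <see cref="IncludeReasoning"/> is set to +/// <see langword="false"/>. +/// </remarks> +public sealed class RelevanceTruthAndCompletenessEvaluatorOptions(bool includeReasoning = false) +{ + ///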

+ /// Gets the default options for . + ///

+ /// + /// is set to by default. + /// + public static RelevanceTruthAndCompletenessEvaluatorOptions Default { get; } = + new RelevanceTruthAndCompletenessEvaluatorOptions(); + + ///

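+ /// Gets a value indicating whether the <see cref="RelevanceTruthAndCompletenessEvaluator"/> should include + /// <see cref="EvaluationDiagnostic"/>s (with <see cref="EvaluationDiagnostic.Severity"/> set to + /// <see cref="EvaluationDiagnosticSeverity.Informational"/>) as part of the returned + /// <see cref="NumericMetric"/>s for 'Relevance', 'Truth' and 'Completeness' to explain the reasoning behind the + /// corresponding scores. By default, <see cref="IncludeReasoning"/> is set to <see langword="false"/>. + ///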

+ public bool IncludeReasoning { get; } = includeReasoning; +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs new file mode 100644 index 00000000000..0145c1e7fb9 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs @@ -0,0 +1,91 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Quality; + +///

+/// An base class that can be used to implement an AI-based that +/// produces an containing a single . +///

+public abstract class SingleNumericMetricEvaluator : ChatConversationEvaluator
+{
+    /// <inheritdoc/>
+    public sealed override IReadOnlyCollection<string> EvaluationMetricNames => [MetricName];
+
+    /// <summary>
+    /// Gets the name of the <see cref="NumericMetric"/> produced by this evaluator.
+    /// </summary>
+    protected abstract string MetricName { get; }
+
+    /// <inheritdoc/>
+    protected override ChatOptions? ChatOptions { get; } =
+        new ChatOptions
+        {
+            // The system prompt below asks for a single-character score, so one output token is requested.
+            MaxOutputTokens = 1,
+            Temperature = 0.0f,
+            TopP = 1.0f,
+            PresencePenalty = 0.0f,
+            FrequencyPenalty = 0.0f,
+            ResponseFormat = ChatResponseFormat.Text
+        };
+
+    /// <inheritdoc/>
+    protected sealed override string? SystemPrompt =>
+        """
+        You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of
+        a response in a question-answering task. Your job is to compute an accurate evaluation score for the provided
+        evaluation metric based on the provided scoring guidance.
+
+        This evaluation score should always be an integer between 1 and 5. So your response should be 1 or 2 or 3 or 4
+        or 5.
+
+        Your response should be a single character containing only the evaluation score. Do not include any other text
+        in your response besides the evaluation score.
+        """;
+
+    // TASK: Explore using structured output and providing a JSON schema to better enforce the LLM response format
+    // requirements above. Tracked by https://github.com/dotnet/extensions/issues/5888.
+
+    /// <inheritdoc/>
+    protected sealed override EvaluationResult InitializeResult()
+    {
+        // The result starts out with a single, value-less numeric metric named after this evaluator's metric.
+        var metric = new NumericMetric(MetricName);
+        return new EvaluationResult(metric);
+    }
+
+    /// <inheritdoc/>
+    protected sealed override ValueTask ParseEvaluationResponseAsync(
+        string modelResponseForEvaluationPrompt,
+        EvaluationResult result,
+        ChatConfiguration chatConfiguration,
+        CancellationToken cancellationToken)
+    {
+        _ = Throw.IfNull(modelResponseForEvaluationPrompt, nameof(modelResponseForEvaluationPrompt));
+        _ = Throw.IfNull(result, nameof(result));
+
+        modelResponseForEvaluationPrompt = modelResponseForEvaluationPrompt.Trim();
+
+        NumericMetric metric = result.Get<NumericMetric>(MetricName);
+
+        // The score is machine-produced text, so parse it with the invariant culture rather than whatever culture the
+        // current thread happens to run under.
+        bool parsed =
+            int.TryParse(
+                modelResponseForEvaluationPrompt,
+                System.Globalization.NumberStyles.Integer,
+                System.Globalization.CultureInfo.InvariantCulture,
+                out int score);
+
+        if (parsed)
+        {
+            metric.Value = score;
+        }
+        else
+        {
+            metric.AddDiagnostic(
+                EvaluationDiagnostic.Error(
+                    $"Failed to parse '{modelResponseForEvaluationPrompt}' as an integer score for '{MetricName}'."));
+        }
+
+        // The interpretation is computed on both paths, i.e. also when no value could be parsed.
+        metric.Interpretation = metric.InterpretScore();
+
+        return default;
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/Utilities/JsonOutputFixer.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/Utilities/JsonOutputFixer.cs
new file mode 100644
index 00000000000..94529808cde
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/Utilities/JsonOutputFixer.cs
@@ -0,0 +1,82 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.Extensions.AI.Evaluation.Quality.Utilities;
+
+internal static class JsonOutputFixer
+{
+    /// <summary>
+    /// Removes surrounding whitespace, markdown backticks and a leading 'json' marker from the supplied text.
+    /// </summary>
+    /// <param name="json">The text that is expected to contain a JSON payload.</param>
+    /// <returns>The portion of <paramref name="json"/> that remains after the delimiters are removed.</returns>
+    internal static ReadOnlySpan<char> TrimMarkdownDelimiters(string json)
+    {
+        // 'AsSpan' is already used below on every target framework, so the input can be sliced in place instead of
+        // being copied into a char[] on non-.NET targets.
+        ReadOnlySpan<char> trimmed = json.AsSpan();
+
+        // Trim whitespace and markdown characters from beginning and end.
+        trimmed = trimmed.Trim().Trim(['`']);
+
+        // Trim 'json' marker from markdown if it exists. The comparison ignores case so that a 'JSON' marker is
+        // stripped as well.
+        const string JsonMarker = "json";
+        int markerLength = JsonMarker.Length;
+        if (trimmed.Length > markerLength &&
+            trimmed.StartsWith(JsonMarker.AsSpan(), StringComparison.OrdinalIgnoreCase))
+        {
+            trimmed = trimmed.Slice(markerLength);
+        }
+
+        // Drop the whitespace (typically line breaks) that sat between the markdown fences and the payload.
+        return trimmed.Trim();
+    }
+
+    /// <summary>
+    /// Asks the chat client in <paramref name="chatConfig"/> to return a syntactically corrected version of
+    /// <paramref name="json"/>.
+    /// </summary>
+    /// <param name="chatConfig">Supplies the chat client that performs the repair.</param>
+    /// <param name="json">The (possibly malformed) JSON text to repair.</param>
+    /// <param name="cancellationToken">A token that can cancel the request.</param>
+    /// <returns>The text of the chat client's response message.</returns>
+    internal static async ValueTask<string?> RepairJsonAsync(
+        ChatConfiguration chatConfig,
+        string json,
+        CancellationToken cancellationToken)
+    {
+        const string SystemPrompt =
+            """
+            You are an AI assistant. Your job is to fix any syntax errors in a supplied JSON object so that it conforms
+            strictly to the JSON standard. Your response should include just the fixed JSON object and nothing else.
+            """;
+
+        string fixPrompt =
+            $"""
+            Fix the following JSON object. Return exactly the same JSON object with the same data content but with any
+            syntax errors corrected.
+
+            If the supplied text includes any markdown delimiters around the JSON object, strip out the markdown
+            delimiters and return just the fixed JSON object. Your response should start with an open curly brace and
+            end with a closing curly brace.
+            ---
+            {json}
+            """;
+
+        ChatOptions chatOptions =
+            new ChatOptions
+            {
+                Temperature = 0.0f,
+                ResponseFormat = ChatResponseFormat.Json
+            };
+
+        var messages = new List<ChatMessage>
+        {
+            new ChatMessage(ChatRole.System, SystemPrompt),
+            new ChatMessage(ChatRole.User, fixPrompt)
+        };
+
+        // TASK: Explore supplying the target json type as a type parameter to the IChatClient.GetResponseAsync()
+        // extension method. Tracked by https://github.com/dotnet/extensions/issues/5888.
+
+        ChatResponse response =
+            await chatConfig.ChatClient.GetResponseAsync(
+                messages,
+                chatOptions,
+                cancellationToken: cancellationToken).ConfigureAwait(false);
+
+        return response.Message.Text;
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageCamelCaseEnumConverter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageCamelCaseEnumConverter.cs
new file mode 100644
index 00000000000..2ec6cdb801f
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageCamelCaseEnumConverter.cs
@@ -0,0 +1,11 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization;
+
+// A string enum converter that is pre-configured with the camelCase naming policy.
+internal sealed class AzureStorageCamelCaseEnumConverter<TEnum>() :
+    JsonStringEnumConverter<TEnum>(JsonNamingPolicy.CamelCase)
+        where TEnum : struct, System.Enum;
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageSerializerContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageSerializerContext.cs
new file mode 100644
index 00000000000..9e6dfc72224
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageSerializerContext.cs
@@ -0,0 +1,31 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using static Microsoft.Extensions.AI.Evaluation.Reporting.Storage.AzureStorageResponseCache;
+
+namespace Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization;
+
+// Source-generated serializer context for the two payloads written to Azure Storage: ScenarioRunResult and the
+// response cache's CacheEntry. The default options produce indented, camelCase JSON and omit null-valued properties.
+[JsonSerializable(typeof(ScenarioRunResult))]
+[JsonSerializable(typeof(CacheEntry))]
+[JsonSourceGenerationOptions(
+    Converters = [
+        // NOTE(review): the type arguments of the two enum converters below appear to be missing in this copy of the
+        // patch - confirm them against the original file.
+        typeof(AzureStorageCamelCaseEnumConverter),
+        typeof(AzureStorageCamelCaseEnumConverter),
+        typeof(AzureStorageTimeSpanConverter)],
+    WriteIndented = true,
+    IgnoreReadOnlyProperties = false,
+    DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
+    PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)]
+internal sealed partial class AzureStorageSerializerContext : JsonSerializerContext
+{
+    // Backing field for 'Compact'; stays null until the property is first read.
+    private static AzureStorageSerializerContext? _compact;
+
+    // A variant of 'Default' that copies its options but turns indentation off. Created on first use and then reused.
+    internal static AzureStorageSerializerContext Compact =>
+        _compact ??=
+            new(new JsonSerializerOptions(Default.Options)
+            {
+                WriteIndented = false,
+            });
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageTimeSpanConverter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageTimeSpanConverter.cs
new file mode 100644
index 00000000000..0c064ededd3
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/JsonSerialization/AzureStorageTimeSpanConverter.cs
@@ -0,0 +1,17 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+ +using System; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization; + +internal sealed class AzureStorageTimeSpanConverter : JsonConverter +{ + public override TimeSpan Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + => TimeSpan.FromSeconds(reader.GetDouble()); + + public override void Write(Utf8JsonWriter writer, TimeSpan value, JsonSerializerOptions options) + => writer.WriteNumberValue(value.TotalSeconds); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Microsoft.Extensions.AI.Evaluation.Reporting.Azure.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Microsoft.Extensions.AI.Evaluation.Reporting.Azure.csproj new file mode 100644 index 00000000000..f705add750e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Microsoft.Extensions.AI.Evaluation.Reporting.Azure.csproj @@ -0,0 +1,32 @@ + + + + A library that provides an additional storage provider based on Azure Storage containers. + $(TargetFrameworks);netstandard2.0 + Microsoft.Extensions.AI.Evaluation.Reporting + + $(NoWarn);EA0002 + + + + AIEval + preview + true + false + 88 + 0 + + + + + + + + + + + + + + + diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md new file mode 100644 index 00000000000..09345b5e58c --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md @@ -0,0 +1,46 @@ +# The Microsoft.Extensions.AI.Evaluation libraries + +`Microsoft.Extensions.AI.Evaluation` is a set of .NET libraries defined in the following NuGet packages that have been designed to work together to support building processes for evaluating the quality of AI software.
+ +* [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. +* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Equivalence and Groundedness. +* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. +* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. +* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. + +## Install the packages + +From the command-line: + +```console +dotnet add package Microsoft.Extensions.AI.Evaluation +dotnet add package Microsoft.Extensions.AI.Evaluation.Quality +dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +``` + +Or directly in the C# project file: + +```xml +<ItemGroup> + <PackageReference Include="Microsoft.Extensions.AI.Evaluation" Version="[CURRENTVERSION]" /> + <PackageReference Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="[CURRENTVERSION]" /> + <PackageReference Include="Microsoft.Extensions.AI.Evaluation.Reporting" Version="[CURRENTVERSION]" /> +</ItemGroup> +``` + +You can optionally add the `Microsoft.Extensions.AI.Evaluation.Reporting.Azure` package in either of these places if you need Azure Storage support.
+ +## Install the command line tool + +```console +dotnet tool install Microsoft.Extensions.AI.Evaluation.Console --create-manifest-if-needed +``` + +## Usage Examples + +For a comprehensive tour of all the functionality, concepts and APIs available in the `Microsoft.Extensions.AI.Evaluation` libraries, check out the [API Usage Examples](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/) available in the [dotnet/ai-samples](https://github.com/dotnet/ai-samples) repo. These examples are structured as a collection of unit tests. Each unit test showcases a specific concept or API, and builds on the concepts and APIs showcased in previous unit tests. + + +## Feedback & Contributing + +We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions). diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageReportingConfiguration.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageReportingConfiguration.cs new file mode 100644 index 00000000000..9ba8f5db079 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageReportingConfiguration.cs @@ -0,0 +1,76 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using Azure.Storage.Files.DataLake; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.Storage; + +///

+/// Contains a factory method for creating a <see cref="ReportingConfiguration"/> that persists +/// <see cref="ScenarioRunResult"/>s to Azure Storage and also uses the storage to cache AI responses. +///

+public static class AzureStorageReportingConfiguration
+{
+    /// <summary>
+    /// Creates a <see cref="ReportingConfiguration"/> that persists <see cref="ScenarioRunResult"/>s to Azure Storage
+    /// and also uses the storage to cache AI responses.
+    /// </summary>
+    /// <param name="client">
+    /// A <see cref="DataLakeDirectoryClient"/> with access to an Azure Storage container under which the
+    /// <see cref="ScenarioRunResult"/>s and all cached AI responses should be stored.
+    /// </param>
+    /// <param name="evaluators">
+    /// The set of <see cref="IEvaluator"/>s that should be invoked to evaluate AI responses.
+    /// </param>
+    /// <param name="timeToLiveForCacheEntries">
+    /// An optional <see cref="TimeSpan"/> that specifies the maximum amount of time that cached AI responses should
+    /// survive in the cache before they are considered expired and evicted.
+    /// </param>
+    /// <param name="chatConfiguration">
+    /// A <see cref="ChatConfiguration"/> that specifies the chat client (and related settings) used by the AI-based
+    /// evaluators included in the returned <see cref="ReportingConfiguration"/>. Can be omitted if none of the
+    /// included evaluators are AI-based. No response cache is configured when this is omitted.
+    /// </param>
+    /// <param name="enableResponseCaching">
+    /// <see langword="true"/> to enable caching of AI responses; <see langword="false"/> otherwise.
+    /// </param>
+    /// <param name="cachingKeys">
+    /// An optional collection of unique strings that should be hashed when generating the cache keys for cached AI
+    /// responses.
+    /// </param>
+    /// <param name="executionName">
+    /// The name of the current execution. Uses a fixed default value "Default" if omitted.
+    /// </param>
+    /// <returns>
+    /// A <see cref="ReportingConfiguration"/> that persists <see cref="ScenarioRunResult"/>s to Azure Storage
+    /// and also uses Azure Storage to cache AI responses.
+    /// </returns>
+    public static ReportingConfiguration Create(
+        DataLakeDirectoryClient client,
+        IEnumerable<IEvaluator> evaluators,
+        TimeSpan? timeToLiveForCacheEntries = null,
+        ChatConfiguration? chatConfiguration = null,
+        bool enableResponseCaching = true,
+        IEnumerable<string>? cachingKeys = null,
+        string executionName = Defaults.DefaultExecutionName)
+    {
+        // Responses are only cached when caching is requested and there is a chat configuration to cache for.
+        IResponseCacheProvider? cacheProvider = null;
+        if (chatConfiguration is not null && enableResponseCaching)
+        {
+            cacheProvider = new AzureStorageResponseCacheProvider(client, timeToLiveForCacheEntries);
+        }
+
+        // Results are always persisted under the same directory client.
+        IResultStore store = new AzureStorageResultStore(client);
+
+        return new ReportingConfiguration(
+            evaluators,
+            store,
+            chatConfiguration,
+            cacheProvider,
+            cachingKeys,
+            executionName);
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCache.CacheEntry.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCache.CacheEntry.cs
new file mode 100644
index 00000000000..8c6153334f8
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCache.CacheEntry.cs
@@ -0,0 +1,101 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#pragma warning disable S3604
+// S3604: Member initializer values should not be redundant.
+// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
+// constructor syntax.
+
+using System;
+using System.Globalization;
+using System.IO;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using System.Threading;
+using System.Threading.Tasks;
+using Azure;
+using Azure.Storage.Files.DataLake;
+using Azure.Storage.Files.DataLake.Models;
+using Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization;
+
+namespace Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
+
+public partial class AzureStorageResponseCache
+{
+    /// <summary>
+    /// The metadata record that is stored as JSON next to the contents of each cached response.
+    /// </summary>
+    [method: JsonConstructor]
+    internal sealed class CacheEntry(
+        string scenarioName,
+        string iterationName,
+        DateTime creation,
+        DateTime expiration)
+    {
+        private const string DeserializationFailedMessage = "Unable to deserialize the cache entry file at {0}.";
+
+        public string ScenarioName { get; } = scenarioName;
+        public string IterationName { get; } = iterationName;
+        public DateTime Creation { get; } = creation;
+        public DateTime Expiration { get; } = expiration;
+
+        /// <summary>
+        /// Downloads the file behind <paramref name="fileClient"/> and deserializes it into a
+        /// <see cref="CacheEntry"/>.
+        /// </summary>
+        /// <exception cref="JsonException">The file content deserialized to <see langword="null"/>.</exception>
+        public static CacheEntry Read(
+            DataLakeFileClient fileClient,
+            CancellationToken cancellationToken = default)
+        {
+            Response<DataLakeFileReadResult> content = fileClient.ReadContent(cancellationToken);
+
+            CacheEntry cacheEntry =
+                JsonSerializer.Deserialize(
+                    content.Value.Content.ToMemory().Span,
+                    AzureStorageSerializerContext.Default.CacheEntry)
+                    ?? throw new JsonException(
+                        string.Format(CultureInfo.CurrentCulture, DeserializationFailedMessage, fileClient.Name));
+
+            return cacheEntry;
+        }
+
+        /// <summary>
+        /// Asynchronous counterpart of <see cref="Read"/>.
+        /// </summary>
+        /// <exception cref="JsonException">The file content deserialized to <see langword="null"/>.</exception>
+        public static async Task<CacheEntry> ReadAsync(
+            DataLakeFileClient fileClient,
+            CancellationToken cancellationToken = default)
+        {
+            Response<DataLakeFileReadResult> content =
+                await fileClient.ReadContentAsync(cancellationToken).ConfigureAwait(false);
+
+            CacheEntry cacheEntry =
+                await JsonSerializer.DeserializeAsync(
+                    content.Value.Content.ToStream(),
+                    AzureStorageSerializerContext.Default.CacheEntry,
+                    cancellationToken).ConfigureAwait(false)
+                    ?? throw new JsonException(
+                        string.Format(CultureInfo.CurrentCulture, DeserializationFailedMessage, fileClient.Name));
+
+            return cacheEntry;
+        }
+
+        /// <summary>
+        /// Serializes this entry to JSON and uploads it through <paramref name="fileClient"/>, overwriting any
+        /// existing file.
+        /// </summary>
+        public void Write(
+            DataLakeFileClient fileClient,
+            CancellationToken cancellationToken = default)
+        {
+            // The buffer is only needed until the upload has completed, so dispose it when leaving the method.
+            using MemoryStream stream = new();
+
+            JsonSerializer.Serialize(stream, this, AzureStorageSerializerContext.Default.CacheEntry);
+
+            // Rewind so that the upload starts at the first serialized byte.
+            _ = stream.Seek(0, SeekOrigin.Begin);
+            _ = fileClient.Upload(stream, overwrite: true, cancellationToken);
+        }
+
+        /// <summary>
+        /// Asynchronous counterpart of <see cref="Write"/>.
+        /// </summary>
+        public async Task WriteAsync(
+            DataLakeFileClient fileClient,
+            CancellationToken cancellationToken = default)
+        {
+            // The buffer is only needed until the upload has completed, so dispose it when leaving the method.
+            using MemoryStream stream = new();
+
+            await JsonSerializer.SerializeAsync(
+                stream,
+                this,
+                AzureStorageSerializerContext.Default.CacheEntry,
+                cancellationToken).ConfigureAwait(false);
+
+            // Rewind so that the upload starts at the first serialized byte.
+            _ = stream.Seek(0, SeekOrigin.Begin);
+            _ = await fileClient.UploadAsync(stream, overwrite: true, cancellationToken).ConfigureAwait(false);
+        }
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCache.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCache.cs
new file mode 100644
index 00000000000..baa3b5360f3
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCache.cs
@@ -0,0 +1,329 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#pragma warning disable S3604
+// S3604: Member initializer values should not be redundant.
+// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
+// constructor syntax.
+
+#pragma warning disable CA1725
+// CA1725: Parameter names should match base declaration.
+// All functions on 'IDistributedCache' use the parameter name 'token' in place of 'cancellationToken'.
However, +// changing the name of the corresponding parameters below to 'token' (in order to fix CA1725) would make the names +// inconsistent with the rest of the codebase. So we suppress this warning. + +using System; +using System.Globalization; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Azure; +using Azure.Storage.Files.DataLake; +using Azure.Storage.Files.DataLake.Models; +using Azure.Storage.Files.DataLake.Specialized; +using Microsoft.Extensions.Caching.Distributed; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.Storage; + +///

+/// An <see cref="IDistributedCache"/> implementation that stores cached AI responses for a particular scenario +/// iteration under an Azure Storage container. +///

+/// <param name="client">
+/// A <see cref="DataLakeDirectoryClient"/> with access to an Azure Storage container under which the cached AI
+/// responses should be stored.
+/// </param>
+/// <param name="scenarioName">
+/// The scenario name for the returned <see cref="AzureStorageResponseCache"/> instance.
+/// </param>
+/// <param name="iterationName">
+/// The iteration name for the returned <see cref="AzureStorageResponseCache"/> instance.
+/// </param>
+/// <param name="timeToLiveForCacheEntries">
+/// An optional <see cref="TimeSpan"/> that specifies the maximum amount of time that cached AI responses should
+/// survive in the cache before they are considered expired and evicted.
+/// </param>
+public sealed partial class AzureStorageResponseCache(
+    DataLakeDirectoryClient client,
+    string scenarioName,
+    string iterationName,
+    TimeSpan? timeToLiveForCacheEntries = null) : IDistributedCache
+{
+    // Every cache key owns a directory 'cache/{scenarioName}/{iterationName}/{key}' that holds the two files below.
+    private const string EntryFileName = "entry.json";
+    private const string ContentsFileName = "contents.data";
+
+    private const string EntryFileNotFound = "Cache entry file {0} was not found.";
+    private const string ContentsFileNotFound = "Cache contents file {0} was not found.";
+    private const string EntryAndContentsFilesNotFound = "Cache entry file {0} and contents file {1} were not found.";
+
+    private readonly string _iterationPath = $"cache/{scenarioName}/{iterationName}";
+    private readonly TimeSpan _timeToLiveForCacheEntries =
+        timeToLiveForCacheEntries ?? Defaults.DefaultTimeToLiveForCacheEntries;
+    private readonly Func<DateTime> _provideDateTime = () => DateTime.UtcNow;
+
+    /// <remarks>
+    /// Intended for testing purposes only.
+    /// </remarks>
+    internal AzureStorageResponseCache(
+        DataLakeDirectoryClient client,
+        string scenarioName,
+        string iterationName,
+        TimeSpan? timeToLiveForCacheEntries,
+        Func<DateTime> provideDateTime)
+            : this(client, scenarioName, iterationName, timeToLiveForCacheEntries)
+    {
+        _provideDateTime = provideDateTime;
+    }
+
+    /// <inheritdoc/>
+    public byte[]? Get(string key)
+    {
+        (string entryFilePath, string contentsFilePath, bool filesExist) = CheckPaths(key);
+
+        if (!filesExist)
+        {
+            return null;
+        }
+
+        CacheEntry entry = CacheEntry.Read(client.GetFileClient(entryFilePath));
+        if (entry.Expiration <= _provideDateTime())
+        {
+            // Expired entries are evicted on access and reported as a cache miss.
+            Remove(key);
+            return null;
+        }
+
+        return client.GetFileClient(contentsFilePath).ReadContent().Value.Content.ToArray();
+    }
+
+    /// <inheritdoc/>
+    public async Task<byte[]?> GetAsync(string key, CancellationToken cancellationToken = default)
+    {
+        (string entryFilePath, string contentsFilePath, bool filesExist) =
+            await CheckPathsAsync(key, cancellationToken).ConfigureAwait(false);
+
+        if (!filesExist)
+        {
+            return null;
+        }
+
+        CacheEntry entry =
+            await CacheEntry.ReadAsync(
+                client.GetFileClient(entryFilePath),
+                cancellationToken: cancellationToken).ConfigureAwait(false);
+
+        if (entry.Expiration <= _provideDateTime())
+        {
+            // Expired entries are evicted on access and reported as a cache miss.
+            await RemoveAsync(key, cancellationToken).ConfigureAwait(false);
+
+            return null;
+        }
+
+        Response<DataLakeFileReadResult> content =
+            await client.GetFileClient(contentsFilePath).ReadContentAsync(cancellationToken).ConfigureAwait(false);
+
+        return content.Value.Content.ToArray();
+    }
+
+    /// <inheritdoc/>
+    /// <exception cref="FileNotFoundException">No files exist for <paramref name="key"/>.</exception>
+    public void Refresh(string key)
+    {
+        (string entryFilePath, string contentsFilePath, bool filesExist) = CheckPaths(key);
+
+        ThrowIfFilesMissing(filesExist, entryFilePath, contentsFilePath);
+
+        DataLakeFileClient entryClient = client.GetFileClient(entryFilePath);
+
+        // Refreshing rewrites the entry file with a new creation time and a correspondingly later expiration.
+        CacheEntry entry = CreateEntry();
+        entry.Write(entryClient);
+    }
+
+    /// <inheritdoc/>
+    /// <exception cref="FileNotFoundException">No files exist for <paramref name="key"/>.</exception>
+    public async Task RefreshAsync(string key, CancellationToken cancellationToken = default)
+    {
+        (string entryFilePath, string contentsFilePath, bool filesExist) =
+            await CheckPathsAsync(key, cancellationToken).ConfigureAwait(false);
+
+        ThrowIfFilesMissing(filesExist, entryFilePath, contentsFilePath);
+
+        DataLakeFileClient entryClient = client.GetFileClient(entryFilePath);
+
+        // Refreshing rewrites the entry file with a new creation time and a correspondingly later expiration.
+        CacheEntry entry = CreateEntry();
+        await entry.WriteAsync(entryClient, cancellationToken: cancellationToken).ConfigureAwait(false);
+    }
+
+    /// <inheritdoc/>
+    public void Remove(string key)
+    {
+        (string entryFilePath, _) = GetPaths(key);
+
+        // Delete the whole key directory, exactly like 'RemoveAsync' and 'DeleteExpiredEntriesAsync' do, so that the
+        // synchronous path does not leave an empty directory behind.
+        DataLakeDirectoryClient keyDirClient = client.GetFileClient(entryFilePath).GetParentDirectoryClient();
+
+        _ = keyDirClient.Delete(recursive: true);
+    }
+
+    /// <inheritdoc/>
+    public async Task RemoveAsync(string key, CancellationToken cancellationToken = default)
+    {
+        (string entryFilePath, _) = GetPaths(key);
+
+        DataLakeDirectoryClient keyDirClient = client.GetFileClient(entryFilePath).GetParentDirectoryClient();
+
+        _ = await keyDirClient.DeleteAsync(
+            recursive: true,
+            cancellationToken: cancellationToken).ConfigureAwait(false);
+    }
+
+    /// <inheritdoc/>
+    public void Set(string key, byte[] value, DistributedCacheEntryOptions options)
+    {
+        (string entryFilePath, string contentsFilePath) = GetPaths(key);
+
+        DataLakeFileClient entryClient = client.GetFileClient(entryFilePath);
+        DataLakeFileClient contentsClient = client.GetFileClient(contentsFilePath);
+
+        // 'options' is not consulted; the expiration is derived from the time-to-live passed to the constructor.
+        CacheEntry entry = CreateEntry();
+        entry.Write(entryClient);
+
+        _ = contentsClient.Upload(BinaryData.FromBytes(value).ToStream(), overwrite: true);
+    }
+
+    /// <inheritdoc/>
+    public async Task SetAsync(
+        string key,
+        byte[] value,
+        DistributedCacheEntryOptions options,
+        CancellationToken cancellationToken = default)
+    {
+        (string entryFilePath, string contentsFilePath) = GetPaths(key);
+
+        DataLakeFileClient entryClient = client.GetFileClient(entryFilePath);
+        DataLakeFileClient contentsClient = client.GetFileClient(contentsFilePath);
+
+        // 'options' is not consulted; the expiration is derived from the time-to-live passed to the constructor.
+        CacheEntry entry = CreateEntry();
+        await entry.WriteAsync(entryClient, cancellationToken: cancellationToken).ConfigureAwait(false);
+
+        _ = await contentsClient.UploadAsync(
+            BinaryData.FromBytes(value).ToStream(),
+            overwrite: true,
+            cancellationToken).ConfigureAwait(false);
+    }
+
+    // Deletes the directory behind 'client' (recursively) if it exists.
+    internal static async ValueTask ResetStorageAsync(
+        DataLakeDirectoryClient client,
+        CancellationToken cancellationToken = default)
+    {
+        _ = await client.DeleteIfExistsAsync(
+            recursive: true,
+            cancellationToken: cancellationToken).ConfigureAwait(false);
+    }
+
+    // Walks every path under 'client' and deletes the parent directory of each entry file whose expiration is not
+    // later than the time returned by 'provideDateTime'.
+    internal static async ValueTask DeleteExpiredEntriesAsync(
+        DataLakeDirectoryClient client,
+        Func<DateTime> provideDateTime,
+        CancellationToken cancellationToken = default)
+    {
+        // The listed names are resolved through the file system client, which is the same for every item.
+        DataLakeFileSystemClient fileSystemClient = client.GetParentFileSystemClient();
+
+        await foreach (PathItem pathItem in
+            client.GetPathsAsync(recursive: true, cancellationToken: cancellationToken).ConfigureAwait(false))
+        {
+            if (!pathItem.Name.EndsWith($"/{EntryFileName}", StringComparison.Ordinal))
+            {
+                continue;
+            }
+
+            DataLakeFileClient entryFileClient = fileSystemClient.GetFileClient(pathItem.Name);
+
+            CacheEntry entry =
+                await CacheEntry.ReadAsync(
+                    entryFileClient,
+                    cancellationToken: cancellationToken).ConfigureAwait(false);
+
+            if (entry.Expiration <= provideDateTime())
+            {
+                DataLakeDirectoryClient parentDirectory = entryFileClient.GetParentDirectoryClient();
+
+                _ = await parentDirectory.DeleteAsync(
+                    recursive: true,
+                    cancellationToken: cancellationToken).ConfigureAwait(false);
+            }
+        }
+    }
+
+    // Throws the combined "not found" error used by Refresh/RefreshAsync when neither file exists.
+    private static void ThrowIfFilesMissing(bool filesExist, string entryFilePath, string contentsFilePath)
+    {
+        if (!filesExist)
+        {
+            throw new FileNotFoundException(
+                string.Format(
+                    CultureInfo.CurrentCulture,
+                    EntryAndContentsFilesNotFound,
+                    entryFilePath,
+                    contentsFilePath));
+        }
+    }
+
+    // Returns whether both files exist; throws when exactly one of the two files is present.
+    private static bool ResolveFilesExist(
+        string entryFilePath,
+        string contentsFilePath,
+        bool entryFileExists,
+        bool contentsFileExists)
+    {
+        if (entryFileExists == contentsFileExists)
+        {
+            return contentsFileExists;
+        }
+
+        throw new FileNotFoundException(
+            contentsFileExists
+                ? string.Format(CultureInfo.CurrentCulture, EntryFileNotFound, entryFilePath)
+                : string.Format(CultureInfo.CurrentCulture, ContentsFileNotFound, contentsFilePath));
+    }
+
+    // Builds the paths (relative to 'client') of the entry file and the contents file for 'key'.
+    private (string entryFilePath, string contentsFilePath) GetPaths(string key)
+    {
+        string entryFilePath = $"{_iterationPath}/{key}/{EntryFileName}";
+        string contentsFilePath = $"{_iterationPath}/{key}/{ContentsFileName}";
+
+        return (entryFilePath, contentsFilePath);
+    }
+
+    private async ValueTask<(string entryFilePath, string contentsFilePath, bool filesExist)> CheckPathsAsync(
+        string key,
+        CancellationToken cancellationToken)
+    {
+        (string entryFilePath, string contentsFilePath) = GetPaths(key);
+
+        DataLakeFileClient entryClient = client.GetFileClient(entryFilePath);
+        bool entryFileExists = await entryClient.ExistsAsync(cancellationToken).ConfigureAwait(false);
+
+        DataLakeFileClient contentsClient = client.GetFileClient(contentsFilePath);
+        bool contentsFileExists = await contentsClient.ExistsAsync(cancellationToken).ConfigureAwait(false);
+
+        bool filesExist = ResolveFilesExist(entryFilePath, contentsFilePath, entryFileExists, contentsFileExists);
+
+        return (entryFilePath, contentsFilePath, filesExist);
+    }
+
+    private (string entryFilePath, string contentsFilePath, bool filesExist) CheckPaths(string key)
+    {
+        (string entryFilePath, string contentsFilePath) = GetPaths(key);
+
+        DataLakeFileClient entryClient = client.GetFileClient(entryFilePath);
+        bool entryFileExists = entryClient.Exists();
+
+        DataLakeFileClient contentsClient = client.GetFileClient(contentsFilePath);
+        bool contentsFileExists = contentsClient.Exists();
+
+        bool filesExist = ResolveFilesExist(entryFilePath, contentsFilePath, entryFileExists, contentsFileExists);
+
+        return (entryFilePath, contentsFilePath, filesExist);
+    }
+
+    // Creates an entry stamped with the current time that expires once the configured time-to-live has elapsed.
+    private CacheEntry CreateEntry()
+    {
+        DateTime creation = _provideDateTime();
+        DateTime expiration = creation.Add(_timeToLiveForCacheEntries);
+
+        return new CacheEntry(scenarioName, iterationName, creation, expiration);
+    }
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCacheProvider.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCacheProvider.cs
new file mode 100644
index 00000000000..e81e6c98fd8
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResponseCacheProvider.cs
@@ -0,0 +1,70 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#pragma warning disable S3604
+// S3604: Member initializer values should not be redundant.
+// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
+// constructor syntax.
+
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using Azure.Storage.Files.DataLake;
+using Microsoft.Extensions.Caching.Distributed;
+
+namespace Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
+
+///

+/// An <see cref="IResponseCacheProvider"/> that returns an <see cref="AzureStorageResponseCache"/>. +///

+/// <param name="client">
+/// A <see cref="DataLakeDirectoryClient"/> with access to an Azure Storage container under which the cached AI
+/// responses should be stored.
+/// </param>
+/// <param name="timeToLiveForCacheEntries">
+/// An optional <see cref="TimeSpan"/> that specifies the maximum amount of time that cached AI responses should
+/// survive in the cache before they are considered expired and evicted.
+/// </param>
+public sealed class AzureStorageResponseCacheProvider(
+    DataLakeDirectoryClient client,
+    TimeSpan? timeToLiveForCacheEntries = null) : IResponseCacheProvider
+{
+    // Use UTC, like the default clock of 'AzureStorageResponseCache' does: this clock is handed to every cache that
+    // is created below, and creation / expiration timestamps that are persisted to shared storage must not depend on
+    // the local time zone (or daylight saving changes) of the machine that wrote or reads them.
+    private readonly Func<DateTime> _provideDateTime = () => DateTime.UtcNow;
+
+    /// <remarks>
+    /// Intended for testing purposes only.
+    /// </remarks>
+    internal AzureStorageResponseCacheProvider(
+        DataLakeDirectoryClient client,
+        TimeSpan? timeToLiveForCacheEntries,
+        Func<DateTime> provideDateTime)
+            : this(client, timeToLiveForCacheEntries)
+    {
+        _provideDateTime = provideDateTime;
+    }
+
+    /// <inheritdoc/>
+    public ValueTask<IDistributedCache> GetCacheAsync(
+        string scenarioName,
+        string iterationName,
+        CancellationToken cancellationToken = default)
+    {
+        // Creating the cache performs no I/O, so the result is returned as an already-completed ValueTask.
+        var cache =
+            new AzureStorageResponseCache(
+                client,
+                scenarioName,
+                iterationName,
+                timeToLiveForCacheEntries,
+                _provideDateTime);
+
+        return new ValueTask<IDistributedCache>(cache);
+    }
+
+    /// <inheritdoc/>
+    public ValueTask ResetAsync(CancellationToken cancellationToken = default)
+        => AzureStorageResponseCache.ResetStorageAsync(client, cancellationToken);
+
+    /// <inheritdoc/>
+    public ValueTask DeleteExpiredCacheEntriesAsync(CancellationToken cancellationToken = default)
+        => AzureStorageResponseCache.DeleteExpiredEntriesAsync(client, _provideDateTime, cancellationToken);
+}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResultStore.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResultStore.cs
new file mode 100644
index 00000000000..7636f8901a5
--- /dev/null
+++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageResultStore.cs
@@ -0,0 +1,212 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Globalization; +using System.IO; +using System.Runtime.CompilerServices; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Azure; +using Azure.Storage.Files.DataLake; +using Azure.Storage.Files.DataLake.Models; +using Azure.Storage.Files.DataLake.Specialized; +using Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.Storage; + +///

+/// An <see cref="IResultStore"/> implementation that stores <see cref="ScenarioRunResult"/>s under an Azure Storage +/// container. +///

+/// +/// A with access to an Azure Storage container under which the +/// s should be stored. +/// +public sealed class AzureStorageResultStore(DataLakeDirectoryClient client) : IResultStore +{ + private const string ResultsRootPrefix = "results"; + + private const string DeserializationFailedMessage = "Unable to deserialize the scenario run result file at {0}."; + + /// + public async IAsyncEnumerable GetLatestExecutionNamesAsync( + int? count = null, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + int remaining = count ?? 1; + + (string path, _) = GetResultPath(); + DataLakeDirectoryClient subClient = client.GetSubDirectoryClient(path); + +#pragma warning disable S3254 // Default parameter value (for 'recursive') should not be passed as argument. + await foreach (PathItem item in + subClient.GetPathsAsync(recursive: false, cancellationToken: cancellationToken).ConfigureAwait(false)) +#pragma warning restore S3254 + { + if (remaining > 0) + { + yield return GetLastSegmentFromPath(item.Name); + remaining--; + } + else + { + break; + } + } + } + + /// + public async IAsyncEnumerable GetScenarioNamesAsync( + string executionName, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + (string path, _) = GetResultPath(executionName); + DataLakeDirectoryClient subClient = client.GetSubDirectoryClient(path); + +#pragma warning disable S3254 // Default parameter value (for 'recursive') should not be passed as argument. 
+ await foreach (PathItem item in + subClient.GetPathsAsync(recursive: false, cancellationToken: cancellationToken).ConfigureAwait(false)) +#pragma warning restore S3254 + { + yield return GetLastSegmentFromPath(item.Name); + } + } + + /// + public async IAsyncEnumerable GetIterationNamesAsync( + string executionName, + string scenarioName, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + (string path, _) = GetResultPath(executionName, scenarioName); + DataLakeDirectoryClient subClient = client.GetSubDirectoryClient(path); + +#pragma warning disable S3254 // Default parameter value (for 'recursive') should not be passed as argument. + await foreach (PathItem item in + subClient.GetPathsAsync(recursive: false, cancellationToken: cancellationToken).ConfigureAwait(false)) +#pragma warning restore S3254 + { + yield return StripExtension(GetLastSegmentFromPath(item.Name)); + } + } + + /// + public async IAsyncEnumerable ReadResultsAsync( + string? executionName = null, + string? scenarioName = null, + string? iterationName = null, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + (string path, _) = GetResultPath(executionName, scenarioName, iterationName); + DataLakeDirectoryClient subClient = client.GetSubDirectoryClient(path); + + await foreach (PathItem pathItem in + subClient.GetPathsAsync(recursive: true, cancellationToken: cancellationToken).ConfigureAwait(false)) + { + if (pathItem.IsDirectory ?? true) + { + continue; + } + + DataLakeFileClient fileClient = client.GetParentFileSystemClient().GetFileClient(pathItem.Name); + + Response content = + await fileClient.ReadContentAsync(cancellationToken).ConfigureAwait(false); + + ScenarioRunResult? result = await JsonSerializer.DeserializeAsync( + content.Value.Content.ToStream(), + AzureStorageSerializerContext.Default.ScenarioRunResult, + cancellationToken).ConfigureAwait(false) + ?? 
throw new JsonException( + string.Format(CultureInfo.CurrentCulture, DeserializationFailedMessage, fileClient.Name)); + + yield return result; + } + } + + /// + public async ValueTask DeleteResultsAsync( + string? executionName = null, + string? scenarioName = null, + string? iterationName = null, + CancellationToken cancellationToken = default) + { + (string path, bool isDir) = GetResultPath(executionName, scenarioName, iterationName); + + if (isDir) + { + _ = await client + .GetSubDirectoryClient(path) + .DeleteIfExistsAsync(recursive: true, cancellationToken: cancellationToken).ConfigureAwait(false); + } + else + { + _ = await client + .GetFileClient(path) + .DeleteIfExistsAsync(cancellationToken: cancellationToken).ConfigureAwait(false); + } + } + + /// + public async ValueTask WriteResultsAsync( + IEnumerable results, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(results, nameof(results)); + + foreach (ScenarioRunResult result in results) + { + cancellationToken.ThrowIfCancellationRequested(); + + (string path, _) = GetResultPath(result.ExecutionName, result.ScenarioName, result.IterationName); + + DataLakeFileClient fileClient = client.GetFileClient(path); + + MemoryStream stream = new(); + + await JsonSerializer.SerializeAsync( + stream, + result, + AzureStorageSerializerContext.Default.ScenarioRunResult, + cancellationToken).ConfigureAwait(false); + + _ = stream.Seek(0, SeekOrigin.Begin); + + _ = await fileClient.UploadAsync( + stream, + overwrite: true, + cancellationToken: cancellationToken).ConfigureAwait(false); + } + } + + private static string GetLastSegmentFromPath(string name) + => name.Substring(name.LastIndexOf('/') + 1); + + private static string StripExtension(string name) + => name.Substring(0, name.LastIndexOf(".", StringComparison.Ordinal)); + + private static (string path, bool isDir) GetResultPath( + string? executionName = null, + string? scenarioName = null, + string? 
iterationName = null) + { + if (executionName is null) + { + return ($"{ResultsRootPrefix}/", isDir: true); + } + else if (scenarioName is null) + { + return ($"{ResultsRootPrefix}/{executionName}/", isDir: true); + } + else if (iterationName is null) + { + return ($"{ResultsRootPrefix}/{executionName}/{scenarioName}/", isDir: true); + } + + return ($"{ResultsRootPrefix}/{executionName}/{scenarioName}/{iterationName}.json", isDir: false); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Defaults.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Defaults.cs new file mode 100644 index 00000000000..777d723b790 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Defaults.cs @@ -0,0 +1,33 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Threading; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// A static class that contains default values for various reporting artifacts. +///

+public static class Defaults +{ + ///

+ /// The default execution name that should be used if one was not specified as part of the + /// <see cref="ReportingConfiguration"/>. + ///

+ public const string DefaultExecutionName = "Default"; + + ///

+ /// The default iteration name that should be used if one was not specified when creating a + /// via + /// . + ///

+ public const string DefaultIterationName = "1"; + + ///

+ /// Gets a <see cref="TimeSpan"/> that specifies the default amount of time that cached AI responses should survive + /// in the <see cref="IResponseCacheProvider"/>'s cache before they are considered expired and evicted. + ///

+ public static TimeSpan DefaultTimeToLiveForCacheEntries { get; } = TimeSpan.FromDays(14); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Dataset.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Dataset.cs new file mode 100644 index 00000000000..1fb8b6c5ec9 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Dataset.cs @@ -0,0 +1,24 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable S3604 +// S3604: Member initializer values should not be redundant. +// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary +// constructor syntax. + +using System; +using System.Collections.Generic; +using System.Text.Json.Serialization; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.Formats; + +[method: JsonConstructor] +internal sealed class Dataset( + IList scenarioRunResults, + DateTime createdAt, + string? generatorVersion) +{ + public IList ScenarioRunResults { get; } = scenarioRunResults; + public DateTime CreatedAt { get; } = createdAt; + public string? GeneratorVersion { get; } = generatorVersion; +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Html/HtmlReportWriter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Html/HtmlReportWriter.cs new file mode 100644 index 00000000000..970341b86b5 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Html/HtmlReportWriter.cs @@ -0,0 +1,102 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Html; + +///

+/// An that generates an HTML report containing all the +/// s present in the supplied s and writes it to the +/// specified . +///

+/// +/// The path to a file where the report will be written. If the file already exists, it will be overwritten. +/// +public sealed class HtmlReportWriter(string reportFilePath) : IEvaluationReportWriter +{ + /// + public async ValueTask WriteReportAsync( + IEnumerable scenarioRunResults, + CancellationToken cancellationToken = default) + { + var dataset = + new Dataset( + scenarioRunResults.ToList(), + createdAt: DateTime.UtcNow, + generatorVersion: Constants.Version); + + using var stream = + new FileStream( + reportFilePath, + FileMode.Create, + FileAccess.Write, + FileShare.None, + bufferSize: 4096, + useAsync: true); + + using var writer = new StreamWriter(stream, Encoding.UTF8); + +#if NET + await writer.WriteAsync(HtmlTemplateBefore.AsMemory(), cancellationToken).ConfigureAwait(false); + await writer.FlushAsync(cancellationToken).ConfigureAwait(false); +#else + await writer.WriteAsync(HtmlTemplateBefore).ConfigureAwait(false); + await writer.FlushAsync().ConfigureAwait(false); +#endif + + await JsonSerializer.SerializeAsync( + stream, + dataset, + SerializerContext.Compact.Dataset, + cancellationToken).ConfigureAwait(false); + +#if NET + await writer.WriteAsync(HtmlTemplateAfter.AsMemory(), cancellationToken).ConfigureAwait(false); + await writer.FlushAsync(cancellationToken).ConfigureAwait(false); +#else + await writer.WriteAsync(HtmlTemplateAfter).ConfigureAwait(false); + await writer.FlushAsync().ConfigureAwait(false); +#endif + } + + private static string HtmlTemplateBefore { get; } + private static string HtmlTemplateAfter { get; } + +#pragma warning disable CA1065, S3877 + // CA1065, S3877: Do not raise exceptions in static constructors. + // We disable this warning because the exception is only thrown in catastrophic circumstances where we somehow + // failed to include the html templates in the assembly as part of the build process. This is highly unlikely to + // happen in practice. 
If this does happen somehow, it is better to fail fast and loudly. + static HtmlReportWriter() + { + using Stream resourceStream = + typeof(HtmlReportWriter).Assembly.GetManifestResourceStream("Reporting.HTML.index.html") + ?? throw new InvalidOperationException("Failed to load HTML template."); + + // TASK: Make this more efficient by scanning the stream rather than reading it all into memory. + using var reader = new StreamReader(resourceStream); + string all = reader.ReadToEnd(); + + // This is the placeholder for the results array in the template. + const string SearchString = @"{scenarioRunResults:[]}"; + + int start = all.IndexOf(SearchString, StringComparison.Ordinal); + if (start == -1) + { + throw new InvalidOperationException($"Placeholder '{SearchString}' not found in the HTML template."); + } + + HtmlTemplateBefore = all.Substring(0, start); + HtmlTemplateAfter = all.Substring(start + SearchString.Length); + } +#pragma warning restore CA1065 +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Json/JsonReportWriter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Json/JsonReportWriter.cs new file mode 100644 index 00000000000..da7921df938 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Formats/Json/JsonReportWriter.cs @@ -0,0 +1,51 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.Formats.Json; + +///

+/// An that generates a JSON report containing all the +/// s present in the supplied s and writes it to the +/// specified . +///

+/// +/// The path to a file where the report will be written. If the file already exists, it will be overwritten. +/// +public sealed class JsonReportWriter(string reportFilePath) : IEvaluationReportWriter +{ + /// + public async ValueTask WriteReportAsync( + IEnumerable scenarioRunResults, + CancellationToken cancellationToken = default) + { + var dataset = + new Dataset( + scenarioRunResults.ToList(), + createdAt: DateTime.UtcNow, + generatorVersion: Constants.Version); + + using var stream = + new FileStream( + reportFilePath, + FileMode.Create, + FileAccess.Write, + FileShare.None, + bufferSize: 4096, + useAsync: true); + + await JsonSerializer.SerializeAsync( + stream, + dataset, + SerializerContext.Default.Dataset, + cancellationToken).ConfigureAwait(false); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IEvaluationReportWriter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IEvaluationReportWriter.cs new file mode 100644 index 00000000000..97c1fdca15e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IEvaluationReportWriter.cs @@ -0,0 +1,26 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Generates a report containing all the s present in the supplied +/// s. +///

+public interface IEvaluationReportWriter +{ + ///

+ /// Writes a report containing all the s present in the supplied + /// s. + ///

+ /// An enumeration of s. + /// A that can cancel the operation. + /// A that represents the asynchronous operation. + ValueTask WriteReportAsync( + IEnumerable scenarioRunResults, + CancellationToken cancellationToken = default); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IResponseCacheProvider.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IResponseCacheProvider.cs new file mode 100644 index 00000000000..6bc8ce25432 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IResponseCacheProvider.cs @@ -0,0 +1,54 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Caching.Distributed; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Provides a way to get the that caches the AI responses associated with a particular +/// . +///

+/// +/// can be used to set up caching of AI-generated responses (both the AI responses +/// under evaluation as well as the AI responses for the evaluations themselves). When caching is enabled, the AI +/// responses associated with each are stored in the that is +/// returned from this . So long as the inputs (such as the content included in the +/// requests, the AI model being invoked etc.) remain unchanged, subsequent evaluations of the same +/// use the cached responses instead of invoking the AI model to generate new ones. Bypassing +/// the AI model when the inputs remain unchanged results in faster execution at a lower cost. +/// +public interface IResponseCacheProvider +{ + ///

+ /// Returns an that caches the AI responses associated with a particular + /// . + ///

+ /// The . + /// The . + /// A that can cancel the operation. + /// + /// An that caches the AI responses associated with a particular + /// . + /// + ValueTask GetCacheAsync( + string scenarioName, + string iterationName, + CancellationToken cancellationToken = default); + + ///

+ /// Deletes cached AI responses for all s. + ///

+ /// A that can cancel the operation. + /// A that represents the asynchronous operation. + ValueTask ResetAsync(CancellationToken cancellationToken = default); + + ///

+ /// Deletes expired cache entries for all s. + ///

+ /// A that can cancel the operation. + /// A that represents the asynchronous operation. + ValueTask DeleteExpiredCacheEntriesAsync(CancellationToken cancellationToken = default); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IResultStore.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IResultStore.cs new file mode 100644 index 00000000000..3f3dea6cc7a --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/IResultStore.cs @@ -0,0 +1,128 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Represents a store for <see cref="ScenarioRunResult"/>s. +///

+public interface IResultStore +{ + ///

+ /// Returns s for s filtered by the specified + /// , , and from + /// the store. + ///

+ /// + /// Returns all s in the store if , + /// , and are all omitted. + /// + /// + /// The by which the s should be filtered. + /// If omitted, all s are considered. + /// + /// + /// The by which the s should be filtered. + /// If omitted, all s that are in scope based on the specified + /// filter are considered. + /// + /// + /// The by which the s should be filtered. + /// If omitted, all s that are in scope based on the specified + /// , and filters are considered. + /// + /// A that can cancel the operation. + /// The matching s. + IAsyncEnumerable ReadResultsAsync( + string? executionName = null, + string? scenarioName = null, + string? iterationName = null, + CancellationToken cancellationToken = default); + + ///

+ /// Writes the supplied <see cref="ScenarioRunResult"/>s to the store. + ///

+ /// The s to be written. + /// A that can cancel the operation. + /// A that represents the asynchronous operation. + ValueTask WriteResultsAsync(IEnumerable results, CancellationToken cancellationToken = default); + + ///

+ /// Deletes s for s filtered by the specified + /// , , and from + /// the store. + ///

+ /// + /// Deletes all s in the store if , + /// , and are all omitted. + /// + /// + /// The by which the s should be filtered. + /// If omitted, all s are considered. + /// + /// + /// The by which the s should be filtered. + /// If omitted, all s that are in scope based on the specified + /// filter are considered. + /// + /// + /// The by which the s should be filtered. + /// If omitted, all s that are in scope based on the specified + /// , and filters are considered. + /// + /// A that can cancel the operation. + /// A that represents the asynchronous operation. + ValueTask DeleteResultsAsync( + string? executionName = null, + string? scenarioName = null, + string? iterationName = null, + CancellationToken cancellationToken = default); + + ///

+ /// Gets the <see cref="ScenarioRunResult.ExecutionName"/>s of the most recent executions from + /// the store (ordered from most recent to least recent). + ///

+ /// The number of s to retrieve. + /// A that can cancel the operation. + /// + /// The s of the most recent executions from the + /// store (ordered from most recent to least recent). + /// + IAsyncEnumerable GetLatestExecutionNamesAsync( + int? count = null, + CancellationToken cancellationToken = default); + + ///

+ /// Gets the <see cref="ScenarioRunResult.ScenarioName"/>s present in the execution with the specified + /// <paramref name="executionName"/>. + ///

+ /// The . + /// A that can cancel the operation. + /// + /// The s present in the execution with the specified + /// . + /// + IAsyncEnumerable GetScenarioNamesAsync( + string executionName, + CancellationToken cancellationToken = default); + + ///

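+ /// Gets the <see cref="ScenarioRunResult.IterationName"/>s present in the scenario with the specified + /// <paramref name="scenarioName"/> under the execution with the specified <paramref name="executionName"/>. + ///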

+ /// The . + /// The . + /// A that can cancel the operation. + /// + /// The s present in the scenario with the specified + /// under the execution with the specified . + /// + IAsyncEnumerable GetIterationNamesAsync( + string executionName, + string scenarioName, + CancellationToken cancellationToken = default); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/CamelCaseEnumConverter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/CamelCaseEnumConverter.cs new file mode 100644 index 00000000000..a1f462f89ac --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/CamelCaseEnumConverter.cs @@ -0,0 +1,11 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization; + +internal sealed class CamelCaseEnumConverter() : + JsonStringEnumConverter(JsonNamingPolicy.CamelCase) + where TEnum : struct, System.Enum; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/SerializerContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/SerializerContext.cs new file mode 100644 index 00000000000..315180c4892 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/SerializerContext.cs @@ -0,0 +1,35 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Text.Json; +using System.Text.Json.Serialization; +using Microsoft.Extensions.AI.Evaluation.Reporting.Formats; +using static Microsoft.Extensions.AI.Evaluation.Reporting.Storage.DiskBasedResponseCache; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization; + +[JsonSerializable(typeof(EvaluationResult))] +[JsonSerializable(typeof(Dataset))] +[JsonSerializable(typeof(CacheEntry))] +[JsonSerializable(typeof(CacheOptions))] +[JsonSourceGenerationOptions( + Converters = [ + typeof(CamelCaseEnumConverter), + typeof(CamelCaseEnumConverter), + typeof(CamelCaseEnumConverter), + typeof(TimeSpanConverter)], + WriteIndented = true, + IgnoreReadOnlyProperties = false, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, + PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)] +internal sealed partial class SerializerContext : JsonSerializerContext +{ + private static SerializerContext? _compact; + + internal static SerializerContext Compact => + _compact ??= + new(new JsonSerializerOptions(Default.Options) + { + WriteIndented = false, + }); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/TimeSpanConverter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/TimeSpanConverter.cs new file mode 100644 index 00000000000..f015c2dcbee --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/JsonSerialization/TimeSpanConverter.cs @@ -0,0 +1,17 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.JsonSerialization; + +internal sealed class TimeSpanConverter : JsonConverter +{ + public override TimeSpan Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + => TimeSpan.FromSeconds(reader.GetDouble()); + + public override void Write(Utf8JsonWriter writer, TimeSpan value, JsonSerializerOptions options) + => writer.WriteNumberValue(value.TotalSeconds); +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Microsoft.Extensions.AI.Evaluation.Reporting.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Microsoft.Extensions.AI.Evaluation.Reporting.csproj new file mode 100644 index 00000000000..467c910b6d8 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Microsoft.Extensions.AI.Evaluation.Reporting.csproj @@ -0,0 +1,91 @@ + + + + A library for aggregating and reporting evaluation data. This library also includes support for caching LLM responses. + $(TargetFrameworks);netstandard2.0 + Microsoft.Extensions.AI.Evaluation.Reporting + + $(NoWarn);EA0002 + + + + AIEval + preview + true + false + 88 + 0 + + + + + + + + + + Reporting.HTML.index.html + + + + + + + + + + + + + + + + + + + $(IntermediateOutputPath)Constants.g.cs + +// +// This file is auto-generated by MSBuild. 
+// + +namespace Microsoft.Extensions.AI.Evaluation.Reporting%3B + +internal static class Constants +{ + public const string Version = "$(Version)"%3B +} + + + + + + + + + + + + + + + + ..\TypeScript\azure-devops-report\VSIXPackageVersion.json + $(VersionPrefix).42424242 + $(VersionPrefix).$(VersionSuffixDateStamp)$(VersionSuffixBuildOfTheDayPadded) + {"PackageVersion":"$(VSIXVersion)"} + + + + + diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/README.md new file mode 100644 index 00000000000..09345b5e58c --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/README.md @@ -0,0 +1,46 @@ +# The Microsoft.Extensions.AI.Evaluation libraries + +`Microsoft.Extensions.AI.Evaluation` is a set of .NET libraries defined in the following NuGet packages that have been designed to work together to support building processes for evaluating the quality of AI software. + +* [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. +* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Equivalence and Groundedness. +* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. 
+* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. +* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. + +## Install the packages + +From the command-line: + +```console +dotnet add package Microsoft.Extensions.AI.Evaluation +dotnet add package Microsoft.Extensions.AI.Evaluation.Quality +dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +``` + +Or directly in the C# project file: + +```xml + + + + + +``` + +You can optionally add the `Microsoft.Extensions.AI.Evaluation.Reporting.Azure` package in either of these places if you need Azure Storage support. + +## Install the command line tool + +```console +dotnet tool install Microsoft.Extensions.AI.Evaluation.Console --create-manifest-if-needed +``` + +## Usage Examples + +For a comprehensive tour of all the functionality, concepts and APIs available in the `Microsoft.Extensions.AI.Evaluation` libraries, check out the [API Usage Examples](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/) available in the [dotnet/ai-samples](https://github.com/dotnet/ai-samples) repo. These examples are structured as a collection of unit tests. Each unit test showcases a specific concept or API, and builds on the concepts and APIs showcased in previous unit tests. + + +## Feedback & Contributing + +We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions). 
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ReportingConfiguration.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ReportingConfiguration.cs new file mode 100644 index 00000000000..ba8e0361c6e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ReportingConfiguration.cs @@ -0,0 +1,238 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Caching.Distributed; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Represents the configuration for a set of s that defines the set of +/// s that should be invoked, the that should be +/// used by these s, how the resulting s should be persisted, +/// and how AI responses should be cached. +///

+public sealed class ReportingConfiguration +{ + ///

+ /// Gets the set of s that should be invoked to evaluate AI responses. + ///

+ public IReadOnlyList Evaluators { get; } + + ///

+ /// Gets the <see cref="IResultStore"/> that should be used to persist the <see cref="ScenarioRunResult"/>s. + ///

+ public IResultStore ResultStore { get; } + + ///

+ /// Gets a that specifies the and the + /// that are used by AI-based included in this + /// . + ///

+ public ChatConfiguration? ChatConfiguration { get; } + + ///

+ /// Gets the <see cref="IResponseCacheProvider"/> that should be used to cache AI responses. + ///

+ public IResponseCacheProvider? ResponseCacheProvider { get; } + + ///

+ /// Gets the collection of unique strings that should be hashed when generating the cache keys for cached AI + /// responses. + ///

+ /// + /// + /// If no additional caching keys are supplied, then the cache keys for a cached response are generated based on + /// the content of the AI request that produced this response, metadata such as model name and endpoint present in + /// the configured and the that are supplied as part of + /// generating the response. + /// + /// + /// Additionally, the name of the scenario and the iteration are always included in the cache key. This means that + /// the cached responses for a particular scenario and iteration will not be reused for a different scenario and + /// iteration even if the AI request content and metadata happen to be the same. + /// + /// + /// Supplying additional caching keys can be useful when some external factors need to be considered when deciding + /// whether a cached AI response is still valid. For example, consider the case where one of the supplied + /// additional caching keys is the version of the AI model being invoked. If the product moves to a newer version + /// of the model, then updating the caching key to reflect this change will cause all cached entries that rely on + /// this caching key to be invalidated thereby ensuring that the subsequent evaluations will not use the outdated + /// cached responses produced by the previous model version. + /// + /// + public IReadOnlyList CachingKeys { get; } + + ///

+ /// Gets the name of the current execution. + ///

+ /// + /// See for more information about this concept. + /// + public string ExecutionName { get; } + + ///

+ /// Gets a function that can be optionally used to override s for + /// s returned from evaluations that use this . + ///

+ /// + /// The supplied function can either return a new for any + /// that is supplied to it, or return if the + /// should be left unchanged. + /// + public Func? EvaluationMetricInterpreter { get; } + + ///

+ /// Initializes a new instance of the <see cref="ReportingConfiguration"/> class. + ///

+ /// + /// The set of s that should be invoked to evaluate AI responses. + /// + /// + /// The that should be used to persist the s. + /// + /// + /// A that specifies the and the + /// that are used by AI-based included in this + /// . Can be omitted if none of the included are + /// AI-based. + /// + /// + /// The that should be used to cache AI responses. If omitted, AI responses + /// will not be cached. + /// + /// + /// An optional collection of unique strings that should be hashed when generating the cache keys for cached AI + /// responses. See for more information about this concept. + /// + /// + /// The name of the current execution. See for more information about this + /// concept. Uses a fixed default value "Default" if omitted. + /// + /// + /// An optional function that can be used to override s for + /// s returned from evaluations that use this . + /// The supplied function can either return a new for any + /// that is supplied to it, or return if the + /// should be left unchanged. + /// + public ReportingConfiguration( + IEnumerable evaluators, + IResultStore resultStore, + ChatConfiguration? chatConfiguration = null, + IResponseCacheProvider? responseCacheProvider = null, + IEnumerable? cachingKeys = null, + string executionName = Defaults.DefaultExecutionName, + Func? evaluationMetricInterpreter = null) + { + Evaluators = [.. evaluators]; + ResultStore = resultStore; + ChatConfiguration = chatConfiguration; + ResponseCacheProvider = responseCacheProvider; + + cachingKeys ??= []; + if (chatConfiguration is not null) + { + cachingKeys = cachingKeys.Concat(GetCachingKeysForChatClient(chatConfiguration.ChatClient)); + } + + CachingKeys = [.. cachingKeys]; + ExecutionName = executionName; + EvaluationMetricInterpreter = evaluationMetricInterpreter; + } + + ///

+ /// Creates a new with the specified and + /// . + ///

+ /// The . + /// + /// The . Uses default value "1" if omitted. + /// + /// + /// An optional collection of unique strings that should be hashed when generating the cache keys for cached AI + /// responses. See for more information about this concept. + /// + /// A that can cancel the operation. + /// + /// A new with the specified and + /// . + /// + public async ValueTask CreateScenarioRunAsync( + string scenarioName, + string iterationName = Defaults.DefaultIterationName, + IEnumerable? additionalCachingKeys = null, + CancellationToken cancellationToken = default) + { + ChatConfiguration? chatConfiguration = ChatConfiguration; + + if (chatConfiguration is not null && ResponseCacheProvider is not null) + { + IChatClient originalChatClient = chatConfiguration.ChatClient; + + IEnumerable cachingKeys = + additionalCachingKeys is null + ? [scenarioName, iterationName, .. CachingKeys] + : [scenarioName, iterationName, .. CachingKeys, .. additionalCachingKeys]; + + IDistributedCache cache = + await ResponseCacheProvider.GetCacheAsync( + scenarioName, + iterationName, + cancellationToken).ConfigureAwait(false); + +#pragma warning disable CA2000 + // CA2000: Dispose objects before they go out of scope. + // ResponseCachingChatClient is a wrapper around the IChatClient supplied by the caller. Disposing + // ResponseCachingChatClient would also dispose the IChatClient supplied by the caller. Disposing this + // within the evaluation library is problematic because the caller would then lose control over the + // lifetime of the supplied IChatClient. We disable this warning because we want to give the caller + // complete control over the lifetime of the supplied IChatClient. 
+ + var cachingChatClient = + new ResponseCachingChatClient( + originalChatClient, + cache, + cachingKeys); +#pragma warning restore CA2000 + + chatConfiguration = new ChatConfiguration(cachingChatClient, chatConfiguration.TokenCounter); + } + + return new ScenarioRun( + scenarioName, + iterationName, + ExecutionName, + Evaluators, + ResultStore, + chatConfiguration, + EvaluationMetricInterpreter); + } + + private static IEnumerable GetCachingKeysForChatClient(IChatClient chatClient) + { + var metadata = chatClient.GetService(); + + string? providerName = metadata?.ProviderName; + if (!string.IsNullOrWhiteSpace(providerName)) + { + yield return providerName!; + } + + Uri? providerUri = metadata?.ProviderUri; + if (providerUri is not null) + { + yield return providerUri.AbsoluteUri; + } + + string? modelId = metadata?.ModelId; + if (!string.IsNullOrWhiteSpace(modelId)) + { + yield return modelId!; + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ResponseCachingChatClient.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ResponseCachingChatClient.cs new file mode 100644 index 00000000000..848b6583ced --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ResponseCachingChatClient.cs @@ -0,0 +1,42 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using Microsoft.Extensions.Caching.Distributed; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

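+/// An <see cref="IChatClient"/> that wraps another <see cref="IChatClient"/> and caches all responses generated using +/// the wrapped <see cref="IChatClient"/> in the supplied <see cref="IDistributedCache"/>. +///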

+public sealed class ResponseCachingChatClient : DistributedCachingChatClient +{ + private readonly IReadOnlyList _cachingKeys; + + ///

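+ /// Initializes a new instance of the <see cref="ResponseCachingChatClient"/> class that wraps the supplied + /// <paramref name="originalChatClient"/> and caches all responses generated using + /// <paramref name="originalChatClient"/> in the supplied <paramref name="cache"/>. + ///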

+ /// The that is wrapped. + /// The where the cached responses are stored. + /// + /// A collection of unique strings that should be hashed when generating the cache keys for cached AI responses. + /// See for more information about this concept. + /// + public ResponseCachingChatClient( + IChatClient originalChatClient, + IDistributedCache cache, + IEnumerable cachingKeys) + : base(originalChatClient, cache) + { + _cachingKeys = [.. cachingKeys]; + } + + /// + protected override string GetCacheKey(params ReadOnlySpan values) + => base.GetCacheKey([.. values, .. _cachingKeys]); + +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRun.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRun.cs new file mode 100644 index 00000000000..8dc189767f2 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRun.cs @@ -0,0 +1,179 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Represents a single execution of a particular iteration of a particular scenario under evaluation. +///

+ /// Gets the name of the scenario that this <see cref="ScenarioRun"/> represents. + ///

+ /// + /// + /// The s of different scenarios within a particular evaluation run must be unique. + /// + /// + /// Logically, a scenario can be mapped to a single unit test within a suite of unit tests that are executed as + /// part of an evaluation. In this case, the for each in the + /// suite can be set to the fully qualified name of the corresponding unit test. + /// + /// + public string ScenarioName { get; } + + ///

+ /// Gets the name of the iteration that this <see cref="ScenarioRun"/> represents. + ///

+ /// + /// + /// The s of different iterations within a particular scenario execution must be unique. + /// + /// + /// Logically, an iteration can be mapped to a single loop iteration within a particular unit test, or to a single + /// data row within a data-driven test. could be set to any string that uniquely + /// identifies the particular loop iteration / data row. For example, it could be set to an integer index that is + /// incremented with each loop iteration. + /// + /// + public string IterationName { get; } + + ///

+ /// Gets the name of the execution that this <see cref="ScenarioRun"/> represents. + ///

+ /// + /// + /// can be set to any string that uniquely identifies a particular execution of a set of + /// scenarios and iterations that are part of an evaluation run. For example, could be + /// set to the build number of the GitHub Actions workflow that runs the evaluation. Or it could be set to the + /// version number of the product being evaluated. It could also be set to a timestamp (so long as all + /// s in a particular evaluation run share the same timestamp for their + /// s). + /// + /// + /// As new builds / workflows are kicked off over time, this would produce a series of executions each with a + /// unique . The results for individual scenarios and iterations can then be compared + /// across these different executions to track how the s for each scenario and + /// iteration are trending over time. + /// + /// + /// If the supplied is not unique, then the results for the scenarios and iterations + /// from the previous execution with the same will be overwritten with the results from + /// the new execution. + /// + /// + public string ExecutionName { get; } + + ///

+ /// Gets a that specifies the and the + /// that are used by AI-based s that are invoked as + /// part of the evaluation of this . + ///

+ public ChatConfiguration? ChatConfiguration { get; } + + private readonly CompositeEvaluator _compositeEvaluator; + private readonly IResultStore _resultStore; + private readonly Func? _evaluationMetricInterpreter; + + private ScenarioRunResult? _result; + + internal ScenarioRun( + string scenarioName, + string iterationName, + string executionName, + IEnumerable evaluators, + IResultStore resultStore, + ChatConfiguration? chatConfiguration = null, + Func? evaluationMetricInterpreter = null) + { + ScenarioName = scenarioName; + IterationName = iterationName; + ExecutionName = executionName; + ChatConfiguration = chatConfiguration; + + _compositeEvaluator = new CompositeEvaluator(evaluators); + _resultStore = resultStore; + _evaluationMetricInterpreter = evaluationMetricInterpreter; + } + + ///

+ /// Evaluates the supplied and returns an + /// containing one or more s. + ///

+ /// + /// The conversation history including the request that produced the supplied . + /// + /// The response that is to be evaluated. + /// + /// Additional contextual information (beyond that which is available in ) that the + /// s included in this may need to accurately evaluate the + /// supplied . + /// + /// + /// A that can cancel the evaluation operation. + /// + /// An containing one or more s. + public async ValueTask EvaluateAsync( + IEnumerable messages, + ChatMessage modelResponse, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + if (_result is not null) + { +#pragma warning disable S103 // Lines should not be too long + throw new InvalidOperationException( + $"The {nameof(ScenarioRun)} with {nameof(ScenarioName)}: {ScenarioName}, {nameof(IterationName)}: {IterationName} and {nameof(ExecutionName)}: {ExecutionName} has already been evaluated. Do not call {nameof(EvaluateAsync)} more than once on a given {nameof(ScenarioRun)}."); +#pragma warning restore S103 + } + + EvaluationResult evaluationResult = + await _compositeEvaluator.EvaluateAsync( + messages, + modelResponse, + ChatConfiguration, + additionalContext, + cancellationToken).ConfigureAwait(false); + + if (_evaluationMetricInterpreter is not null) + { + evaluationResult.Interpret(_evaluationMetricInterpreter); + } + + _result = + new ScenarioRunResult( + ScenarioName, + IterationName, + ExecutionName, + creationTime: DateTime.UtcNow, + messages, + modelResponse, + evaluationResult); + + return evaluationResult; + } + + ///

+ /// Disposes the <see cref="ScenarioRun"/> and writes the <see cref="ScenarioRunResult"/> to the configured + /// <see cref="IResultStore"/>. + ///

+ /// A that represents the asynchronous operation. + public async ValueTask DisposeAsync() + { + if (_result is not null) + { + await _resultStore.WriteResultsAsync([_result]).ConfigureAwait(false); + } + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunExtensions.cs new file mode 100644 index 00000000000..3c9a8fd5d44 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunExtensions.cs @@ -0,0 +1,132 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Extension methods for <see cref="ScenarioRun"/>. +///

+public static class ScenarioRunExtensions +{ + ///

+ /// Evaluates the supplied and returns an + /// containing one or more s. + ///

+ /// The of which this evaluation is a part. + /// The response that is to be evaluated. + /// + /// Additional contextual information that the s included in this + /// may need to accurately evaluate the supplied . + /// + /// + /// A that can cancel the evaluation operation. + /// + /// An containing one or more s. + public static ValueTask EvaluateAsync( + this ScenarioRun scenarioRun, + string modelResponse, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) => + scenarioRun.EvaluateAsync( + modelResponse: new ChatMessage(ChatRole.Assistant, modelResponse), + additionalContext: additionalContext, + cancellationToken: cancellationToken); + + ///

+ /// Evaluates the supplied and returns an + /// containing one or more s. + ///

+ /// The of which this evaluation is a part. + /// The response that is to be evaluated. + /// + /// Additional contextual information that the s included in this + /// may need to accurately evaluate the supplied . + /// + /// + /// A that can cancel the evaluation operation. + /// + /// An containing one or more s. + public static ValueTask EvaluateAsync( + this ScenarioRun scenarioRun, + ChatMessage modelResponse, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(scenarioRun, nameof(scenarioRun)); + + return scenarioRun.EvaluateAsync( + messages: [], + modelResponse, + additionalContext, + cancellationToken); + } + + ///

+ /// Evaluates the supplied and returns an + /// containing one or more s. + ///

+ /// The of which this evaluation is a part. + /// + /// The request that produced the that is to be evaluated. + /// + /// The response that is to be evaluated. + /// + /// Additional contextual information (beyond that which is available in ) that the + /// s included in this may need to accurately evaluate the + /// supplied . + /// + /// + /// A that can cancel the evaluation operation. + /// + /// An containing one or more s. + public static ValueTask EvaluateAsync( + this ScenarioRun scenarioRun, + ChatMessage userRequest, + ChatMessage modelResponse, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + _ = Throw.IfNull(scenarioRun, nameof(scenarioRun)); + + return scenarioRun.EvaluateAsync( + messages: [userRequest], + modelResponse, + additionalContext, + cancellationToken); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunResult.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunResult.cs new file mode 100644 index 00000000000..22d9ff0167e --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunResult.cs @@ -0,0 +1,130 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma warning disable S3604 +// S3604: Member initializer values should not be redundant. +// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary +// constructor syntax. + +using System; +using System.Collections.Generic; +using System.Text.Json.Serialization; +using System.Threading; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Represents the results of a single execution of a particular iteration of a particular scenario under evaluation. +/// In other words, represents the results of evaluating a +/// and includes the that is produced when +/// +/// is invoked. +///

+/// +/// Each execution of an evaluation run is assigned a unique . A single such evaluation run +/// can contain evaluations for multiple scenarios each with a unique . The execution of each +/// such scenario in turn can include multiple iterations each with a unique . +/// +/// The . +/// The . +/// The . +/// The time at which this was created. +/// +/// The conversation history including the request that produced the being evaluated. +/// +/// The response being evaluated. +/// +/// The for the corresponding to the +/// being constructed. +/// +[method: JsonConstructor] +public sealed class ScenarioRunResult( + string scenarioName, + string iterationName, + string executionName, + DateTime creationTime, + IList messages, + ChatMessage modelResponse, + EvaluationResult evaluationResult) +{ + ///

+ /// Initializes a new instance of the <see cref="ScenarioRunResult"/> class. + ///

+ /// The . + /// The . + /// The . + /// The time at which this was created. + /// + /// The conversation history including the request that produced the being evaluated. + /// + /// The response being evaluated. + /// + /// The for the corresponding to the + /// being constructed. + /// + public ScenarioRunResult( + string scenarioName, + string iterationName, + string executionName, + DateTime creationTime, + IEnumerable messages, + ChatMessage modelResponse, + EvaluationResult evaluationResult) + : this( + scenarioName, + iterationName, + executionName, + creationTime, + [.. messages], + modelResponse, + evaluationResult) + { + } + + ///

+ /// Gets or sets the . + ///

+ public string ScenarioName { get; set; } = scenarioName; + + ///

+ /// Gets or sets the . + ///

+ public string IterationName { get; set; } = iterationName; + + ///

+ /// Gets or sets the . + ///

+ public string ExecutionName { get; set; } = executionName; + + ///

+ /// Gets or sets the time at which this <see cref="ScenarioRunResult"/> was created. + ///

+ public DateTime CreationTime { get; set; } = creationTime; + + ///

+ /// Gets or sets the conversation history including the request that produced the being + /// evaluated in this . + ///

+#pragma warning disable CA2227 + // CA2227: Collection properties should be read only. + // We disable this warning because we want this type to be fully mutable for serialization purposes and for general + // convenience. + public IList Messages { get; set; } = messages; +#pragma warning restore CA2227 + + ///

+ /// Gets or sets the response being evaluated in this <see cref="ScenarioRunResult"/>. + ///

+ public ChatMessage ModelResponse { get; set; } = modelResponse; + + ///

+ /// Gets or sets the for the corresponding to + /// this . + ///

+ /// + /// This is the same that is returned when + /// + /// is invoked. + /// + public EvaluationResult EvaluationResult { get; set; } = evaluationResult; +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunResultExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunResultExtensions.cs new file mode 100644 index 00000000000..8b82a7336cf --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ScenarioRunResultExtensions.cs @@ -0,0 +1,37 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting; + +///

+/// Extension methods for <see cref="ScenarioRunResult"/>. +///

+public static class ScenarioRunResultExtensions +{ + ///

+ /// Returns if any contained in the supplied + /// contains an matching the supplied + /// ; otherwise. + ///

+ /// The that is to be inspected. + /// + /// A predicate that returns if a matching is found; + /// otherwise. + /// + /// + /// if any contained in the supplied + /// contains an matching the supplied + /// ; otherwise. + /// + public static bool ContainsDiagnostics( + this ScenarioRunResult result, + Func? predicate = null) + { + _ = Throw.IfNull(result, nameof(result)); + + return result.EvaluationResult.ContainsDiagnostics(predicate); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Storage/DiskBasedReportingConfiguration.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Storage/DiskBasedReportingConfiguration.cs new file mode 100644 index 00000000000..83a0812cb77 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/Storage/DiskBasedReportingConfiguration.cs @@ -0,0 +1,72 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.IO; + +namespace Microsoft.Extensions.AI.Evaluation.Reporting.Storage; + +///

+/// Contains a factory method for creating a <see cref="ReportingConfiguration"/> that persists +/// <see cref="ScenarioRunResult"/>s to disk and also uses the disk to cache AI responses. +///

+public static class DiskBasedReportingConfiguration +{ + ///

+ /// Creates a <see cref="ReportingConfiguration"/> that persists <see cref="ScenarioRunResult"/>s to disk and also + /// uses the disk to cache AI responses. + ///

+ /// + /// The path to a directory on disk under which the s and all cached AI responses + /// should be stored. + /// + /// + /// The set of s that should be invoked to evaluate AI responses. + /// + /// + /// A that specifies the and the + /// that are used by AI-based included in the + /// returned