Implement The Evaluator Class, Ledger Write Pipeli...
Updated/New Files
1. pyproject.toml (Updated - added prometheus_client and structlog)
# pyproject.toml
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "recursive-oversight"
version = "0.1.0"
authors = []
description = "A Python package for Recursive Governance Oversight connectors and loops."
readme = "README.md"
requires-python = ">=3.9"
classifiers = []
dependencies = [
    "SQLAlchemy~=2.0",
    "psycopg2-binary~=2.9",
    "requests~=2.31",
    "boto3~=1.34",
    "snowflake-connector-python~=3.6",
    "prometheus_client~=0.19",
    "structlog~=24.2",
]

[project.urls]
"Homepage" = "https://github.com/your-org/recursive-oversight"  # Replace with your repo
"Bug Tracker" = "https://github.com/your-org/recursive-oversight/issues"  # Replace with your repo

[tool.setuptools.packages.find]
exclude = ["tests*", "docs*"]
# oversight/ledger.py
import datetime
import json
import logging
import os
import uuid
from typing import Any, Dict

import boto3

# Assumed to be defined in oversight/config.py (see Next Steps).
from oversight.config import AWS_REGION, LEDGER_S3_BUCKET, LEDGER_S3_PREFIX

# Logging Configuration
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
logger = logging.getLogger(__name__)


class OversightLedger:
    """
    A central ledger to store all loop outputs.
    In this version, it persists each entry as its own JSON object in S3,
    partitioned by date and loop type for auditability and analytics.
    """

    def __init__(self, s3_bucket: str = LEDGER_S3_BUCKET, s3_prefix: str = LEDGER_S3_PREFIX):
        self.s3_bucket = s3_bucket
        self.s3_prefix = s3_prefix
        self.s3_client = boto3.client('s3', region_name=AWS_REGION)
        logger.info(f"OversightLedger initialized to S3: s3://{self.s3_bucket}/{self.s3_prefix}")

    def record(self, entry: Dict[str, Any]) -> None:
        """Writes a single ledger entry to S3 under a date/loop/org partitioned key."""
        entry_id = str(uuid.uuid4())  # Unique ID per record
        timestamp_dt = datetime.datetime.now(datetime.timezone.utc)
        org_key_for_path = str(entry.get('key', 'unknown_org'))
        s3_key = (
            f"{self.s3_prefix}/"
            f"year={timestamp_dt.year}/"
            f"month={timestamp_dt.month:02d}/"
            f"day={timestamp_dt.day:02d}/"
            f"loop={entry.get('loop', 'unknown_loop')}/"
            f"org={org_key_for_path}/"
            f"{entry_id}.json"
        )
        try:
            self.s3_client.put_object(
                Bucket=self.s3_bucket,
                Key=s3_key,
                Body=json.dumps(entry, indent=2).encode('utf-8'),
                ContentType='application/json'
            )
            logger.info(
                f"Ledger entry for loop '{entry.get('loop')}' key '{entry.get('key')}' "
                f"recorded to S3: s3://{self.s3_bucket}/{s3_key}"
            )
        except Exception as e:
            logger.error(
                f"Failed to record ledger entry to S3 for loop '{entry.get('loop')}' "
                f"key '{entry.get('key')}': {e}",
                exc_info=True
            )
            # In a production system, consider a dead-letter queue or retry mechanism here.

    def clear(self) -> None:
        logger.warning(
            "Clear method is for development/testing only. "
            "S3 objects are immutable and not 'cleared' this way."
        )
        # In a real S3 ledger, you'd manage object lifecycle policies or run a cleanup job.
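For reference, a minimal usage sketch of the ledger. It relies on the LEDGER_S3_BUCKET and LEDGER_S3_PREFIX defaults from oversight/config.py; the entry fields mirror what the oversight loop writes later in this section, and the run_id value is illustrative:

import datetime

from oversight.ledger import OversightLedger

ledger = OversightLedger()  # uses the configured bucket/prefix defaults
ledger.record({
    "loop": "MicroLoop",
    "key": 42,                       # organization key
    "run_id": "manual__2024-01-01",  # correlation ID (illustrative)
    "judgment": "PASS",
    "loop_status": "success",
    "timestamp": datetime.datetime.now(datetime.timezone.utc)
        .isoformat(timespec="milliseconds").replace("+00:00", "Z"),
})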
# oversight/evaluator.py
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, Optional, Tuple

logger = logging.getLogger(__name__)


class JudgmentLevel(Enum):
    """
    Defines the gradient judgment levels for oversight outcomes.
    Values are ordered, allowing for comparison.
    """
    FAIL = 1     # Score < 0.5
    WARNING = 2  # 0.5 <= Score < 0.75
    PASS = 3     # Score >= 0.75


@dataclass
class MetricEvaluation:
    """Evaluation result for a single metric."""
    metric_name: str
    value: Optional[float]
    threshold: float
    meets_threshold: bool


@dataclass
class OverallEvaluation:
    """Aggregated, weighted evaluation across all metrics."""
    total_score: float
    judgment: JudgmentLevel
    individual_metrics: Dict[str, MetricEvaluation] = field(default_factory=dict)


class Evaluator:
    """
    Evaluates multiple metrics, applies weights, calculates a total score,
    and maps the score to a gradient judgment level.
    """

    def __init__(
        self,
        metric_weights: Dict[str, float],
        judgment_ranges: Dict[JudgmentLevel, Tuple[float, float]]
    ):
        """
        Initializes the Evaluator with metric weights and judgment ranges.

        Args:
            metric_weights: A dictionary mapping metric names to their weights.
                Weights should sum to 1.0 if normalized, or be relative.
            judgment_ranges: A dictionary mapping each judgment level to its
                (inclusive lower bound, exclusive upper bound) score range.
        """
        self.metric_weights = metric_weights
        self.judgment_ranges = judgment_ranges
        logger.info(f"Evaluator initialized with weights: {self.metric_weights} "
                    f"and ranges: {self.judgment_ranges}")

    def evaluate(self, metric_values: Dict[str, Optional[float]],
                 criteria: Dict[str, float]) -> OverallEvaluation:
        """Scores each metric against its threshold and aggregates a weighted total.

        NOTE: the signature and aggregation rule were not shown in the excerpt
        and are reconstructed here (each metric contributes its weight when it
        meets its threshold, zero otherwise).
        """
        individual_evals: Dict[str, MetricEvaluation] = {}
        total_weighted_sum = 0.0
        sum_of_actual_weights = 0.0
        for metric_name, value in metric_values.items():
            threshold = criteria.get(metric_name, 0.0)
            meets_threshold = value is not None and value >= threshold
            individual_evals[metric_name] = MetricEvaluation(
                metric_name=metric_name,
                value=value,
                threshold=threshold,
                meets_threshold=meets_threshold
            )
            if value is not None:
                weight = self.metric_weights.get(metric_name, 0.0)
                total_weighted_sum += (1.0 if meets_threshold else 0.0) * weight
                sum_of_actual_weights += weight

        if sum_of_actual_weights == 0:
            total_score = 0.0  # Avoid division by zero if no valid metrics or weights
            logger.warning("Sum of actual weights is zero. Total score set to 0.0.")
        else:
            total_score = total_weighted_sum / sum_of_actual_weights

        judgment = JudgmentLevel.FAIL
        for level, (low, high) in self.judgment_ranges.items():
            if low <= total_score < high:
                judgment = level
                break
        return OverallEvaluation(total_score=total_score, judgment=judgment,
                                 individual_metrics=individual_evals)
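A short usage sketch under the reconstructed signature above; the weights, ranges, metric values, and thresholds are illustrative only:

from oversight.evaluator import Evaluator, JudgmentLevel

weights = {"audit_timeliness": 0.5, "public_trust_index": 0.5}
ranges = {
    JudgmentLevel.FAIL: (0.0, 0.5),
    JudgmentLevel.WARNING: (0.5, 0.75),
    JudgmentLevel.PASS: (0.75, 1.01),  # upper bound just above 1.0 so a perfect score maps to PASS
}
evaluator = Evaluator(metric_weights=weights, judgment_ranges=ranges)

result = evaluator.evaluate(
    metric_values={"audit_timeliness": 0.95, "public_trust_index": 65.0},
    criteria={"audit_timeliness": 0.9, "public_trust_index": 70.0},
)
print(result.total_score, result.judgment.name)  # 0.5 WARNING: only one of two metrics met its threshold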
# oversight/connectors/audit.py
import logging
from contextlib import contextmanager
from typing import Optional

from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker

# Assumed to come from oversight/config.py and oversight/exceptions.py.
from oversight.config import DATABASE_URL
from oversight.exceptions import DataFetchError

logger = logging.getLogger(__name__)

# Lazily-initialized module-level engine and session factory.
_engine = None
_SessionLocal = None


def _get_engine():
    global _engine
    if _engine is None:
        try:
            _engine = create_engine(DATABASE_URL, pool_size=10, max_overflow=20)
            logger.info("SQLAlchemy engine created for audit connector.")
        except Exception as e:
            logger.error(f"Failed to create SQLAlchemy engine for audit connector: {e}", exc_info=True)
            raise DataFetchError("Database connection failed during engine creation for audit connector.") from e
    return _engine


def _get_session_local():
    global _SessionLocal
    if _SessionLocal is None:
        _SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=_get_engine())
        logger.info("SQLAlchemy sessionmaker created for audit connector.")
    return _SessionLocal


@contextmanager
def get_db_session() -> Session:
    session_local_instance = _get_session_local()
    session = session_local_instance()
    try:
        yield session
    except Exception as e:
        logger.error(f"Audit connector session error: {e}", exc_info=True)
        session.rollback()
        raise DataFetchError("Database operation failed in audit connector.") from e
    finally:
        session.close()


def _sync_fetch_audit_timeliness_logic(org_id: int) -> Optional[float]:
    """Synchronously computes the fraction of audit reports filed on time for an org."""
    try:
        with get_db_session() as session:
            # The actual timeliness query is omitted from this excerpt; it should
            # return a single scalar fraction for the given org_id.
            result = session.execute(...).scalar()
            if result is None:
                logger.warning(f"No audit reports found for org_id: {org_id}.")
                return None
            timeliness_fraction = float(result)
            logger.info(f"Fetched audit_timeliness for org_id {org_id}: {timeliness_fraction:.2f}")
            return timeliness_fraction
    except DataFetchError:
        raise
    except Exception as e:
        logger.error(f"Unexpected error in _sync_fetch_audit_timeliness_logic for org_id {org_id}: {e}", exc_info=True)
        raise DataFetchError(f"Failed to fetch audit_timeliness for org_id {org_id} due to unexpected error.") from e
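The AuditConnector class that wraps this synchronous logic is not shown in the excerpt. A minimal sketch of how it could satisfy the async Connector protocol, assuming Python 3.9+ (asyncio.to_thread) and a hypothetical AUDIT_FETCH_COUNT counter that mirrors the naming pattern used by the other connectors:

import asyncio
from typing import Optional, Union

from prometheus_client import Counter

# Hypothetical metric name, mirroring the other connectors.
AUDIT_FETCH_COUNT = Counter('audit_fetch_total', 'Total audit_timeliness fetch attempts', ['org_id', 'status'])


class AuditConnector:
    """Connector for fetching audit_timeliness from the audit database (sketch)."""

    async def fetch(self, key: Union[int, str]) -> Optional[float]:
        org_id = int(key)
        status = "failure"
        try:
            # Run the blocking SQLAlchemy work off the event loop.
            value = await asyncio.to_thread(_sync_fetch_audit_timeliness_logic, org_id)
            status = "success" if value is not None else "warning"
            return value
        finally:
            AUDIT_FETCH_COUNT.labels(org_id=org_id, status=status).inc()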
# oversight/connectors/public_trust.py
import datetime
import json
import logging
from typing import Optional, Union

import boto3
import requests
from prometheus_client import Counter, Summary

# Assumed to come from oversight/config.py and oversight/exceptions.py.
from oversight.config import AWS_REGION, PUBLIC_TRUST_API_BASE_URL, PUBLIC_TRUST_API_KEY
from oversight.exceptions import PublicAPIFetchError

logger = logging.getLogger(__name__)

# --- Prometheus Metrics ---
# Declarations were not shown in the excerpt; names mirror the derived.py pattern below.
PUBLIC_TRUST_FETCH_TIME = Summary('public_trust_fetch_seconds', 'Time to fetch public trust index', ['org_name', 'status'])
PUBLIC_TRUST_FETCH_COUNT = Counter('public_trust_fetch_total', 'Total public trust index fetch attempts', ['org_name', 'status'])


class PublicTrustConnector:
    """
    Connector for fetching public_trust_index from an external API.
    Implements the Connector protocol.
    """

    def __init__(self):
        self.s3_client = boto3.client('s3', region_name=AWS_REGION)

    async def fetch(self, key: Union[int, str]) -> Optional[float]:
        """
        Asynchronously fetches the public trust index for a given organization.
        Expects 'key' to be a string 'org_name'.
        Stores raw response in S3.
        """
        if not isinstance(key, str):
            raise TypeError("PublicTrustConnector expects 'key' to be a string organization name.")
        org_name = key
        api_url = f"{PUBLIC_TRUST_API_BASE_URL}/score"
        headers = {"Authorization": f"Bearer {PUBLIC_TRUST_API_KEY}"}
        params = {"org": org_name}
        current_time_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()
        s3_key = f"public_trust_api_raw/{org_name}/{current_time_iso}.json"
        status = "failure"
        try:
            # NOTE: the HTTP call is assumed; the excerpt omitted it. A blocking
            # requests call keeps the prototype simple.
            response = requests.get(api_url, headers=headers, params=params, timeout=10)
            response.raise_for_status()
            raw_data = response.json()
            # Raw-response archival to S3 (using s3_key above) is sketched after this excerpt.
            score = raw_data.get("score")
            if score is None:
                logger.warning(f"Public trust index 'score' not found in API response for {org_name}.")
                status = "warning"  # Partial success
                return None
            status = "success"
            return float(score)
        except requests.exceptions.Timeout:
            status = "failure"
            logger.error(f"Public trust API request timed out for {org_name}.", exc_info=True)
            raise PublicAPIFetchError(f"Public trust API timeout for {org_name}")
        except requests.exceptions.RequestException as e:
            status = "failure"
            logger.error(f"Error calling public trust API for {org_name}: {e}", exc_info=True)
            raise PublicAPIFetchError(f"Failed to call public trust API for {org_name}") from e
        except json.JSONDecodeError:
            status = "failure"
            logger.error(f"Failed to decode JSON response from public trust API for {org_name}.", exc_info=True)
            raise PublicAPIFetchError(f"Invalid JSON from public trust API for {org_name}")
        except Exception as e:
            status = "failure"
            logger.error(f"Unexpected error fetching public trust index for {org_name}: {e}", exc_info=True)
            raise PublicAPIFetchError(f"Unexpected error for {org_name}") from e
        finally:
            PUBLIC_TRUST_FETCH_COUNT.labels(org_name=org_name, status=status).inc()  # Increment counter
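The docstring above promises that the raw API response is stored in S3, but that step is missing from the excerpt. A minimal sketch of a helper the connector might call right after parsing the response, assuming a hypothetical RAW_DATA_S3_BUCKET setting in oversight/config.py; the s3_key argument is the one built in fetch():

    def _archive_raw_response(self, s3_key: str, raw_data: dict) -> None:
        """Best-effort archival of the raw API payload for later audits (sketch)."""
        try:
            self.s3_client.put_object(
                Bucket=RAW_DATA_S3_BUCKET,  # hypothetical config value
                Key=s3_key,
                Body=json.dumps(raw_data).encode("utf-8"),
                ContentType="application/json",
            )
        except Exception:
            # Archival is best-effort; never fail the metric fetch because of it.
            logger.warning(f"Failed to archive raw public trust response to {s3_key}", exc_info=True)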
# oversight/connectors/derived.py
import json
import logging
from typing import Optional, Union

import boto3
from prometheus_client import Counter, Summary

# Assumed to come from oversight/config.py and oversight/exceptions.py.
from oversight.config import AWS_LAMBDA_FUNCTION_NAME, AWS_REGION
from oversight.exceptions import DerivedMetricError

logger = logging.getLogger(__name__)

# --- Prometheus Metrics ---
ALIGNMENT_FETCH_TIME = Summary('alignment_fetch_seconds', 'Time to fetch mission alignment score', ['org_id', 'status'])
ALIGNMENT_FETCH_COUNT = Counter('alignment_fetch_total', 'Total mission alignment score fetch attempts', ['org_id', 'status'])


class AlignmentConnector:
    """
    Connector for fetching mission_alignment_score by invoking an AWS Lambda function.
    Implements the Connector protocol.
    """

    def __init__(self):
        self.lambda_client = boto3.client('lambda', region_name=AWS_REGION)

    async def fetch(self, key: Union[int, str]) -> Optional[float]:
        """Asynchronously fetches the mission alignment score for an organization.

        NOTE: the signature, payload shape, and invoke call are reconstructed;
        the excerpt omitted them.
        """
        org_id = int(key)
        status = "failure"
        try:
            response = self.lambda_client.invoke(
                FunctionName=AWS_LAMBDA_FUNCTION_NAME,
                InvocationType='RequestResponse',
                Payload=json.dumps({"org_id": org_id}).encode('utf-8')
            )
            response_payload = json.loads(response['Payload'].read())
            if 'FunctionError' in response:
                error_message = response_payload.get('errorMessage', 'Unknown Lambda error')
                error_type = response_payload.get('errorType', 'LambdaInvocationError')
                logger.error(f"Lambda function error for org_id {org_id}: {error_type} - {error_message}")
                status = "failure"
                raise DerivedMetricError(f"Lambda function error: {error_type} - {error_message}")
            score = response_payload.get("score")
            if score is None:
                logger.warning(f"Lambda did not return 'score' for org_id: {org_id}. Response: {response_payload}")
                status = "warning"  # Partial success
                return None
            status = "success"
            return float(score)
        except self.lambda_client.exceptions.ResourceNotFoundException:
            status = "failure"
            logger.error(f"Lambda function '{AWS_LAMBDA_FUNCTION_NAME}' not found.", exc_info=True)
            raise DerivedMetricError(f"Lambda function '{AWS_LAMBDA_FUNCTION_NAME}' not found")
        except json.JSONDecodeError:
            status = "failure"
            logger.error(f"Failed to decode JSON response from Lambda for org_id {org_id}.", exc_info=True)
            raise DerivedMetricError(f"Invalid JSON from Lambda for org_id {org_id}")
        except DerivedMetricError:
            raise
        except Exception as e:
            status = "failure"
            logger.error(f"Unexpected error invoking Lambda for org_id {org_id}: {e}", exc_info=True)
            raise DerivedMetricError(f"Failed to get mission alignment score for org_id {org_id}") from e
        finally:
            ALIGNMENT_FETCH_COUNT.labels(org_id=org_id, status=status).inc()  # Increment counter
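For local testing (see Next Steps), the Lambda side can be a trivial stub that returns the shape fetch() expects. A minimal sketch, assuming the connector sends {"org_id": ...} in its payload and a static score is acceptable:

# handler.py for a dummy mission-alignment Lambda (illustrative only)
def lambda_handler(event, context):
    """Returns a static mission alignment score in the shape AlignmentConnector expects."""
    org_id = event.get("org_id")
    return {"org_id": org_id, "score": 0.82}  # static score for testing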
# oversight/loops/base.py
import datetime
from typing import Dict, List, Optional, Union

import structlog
from prometheus_client import Counter, Summary

from oversight.evaluator import Evaluator, JudgmentLevel, OverallEvaluation
from oversight.ledger import OversightLedger
# The Connector protocol is assumed to live in the connectors package.
from oversight.connectors.base import Connector

logger = structlog.get_logger(__name__)

# --- Prometheus Metrics ---
# Names are reconstructed; only LOOP_RUN_TIME is referenced in the original excerpt.
LOOP_RUN_TIME = Summary('oversight_loop_run_seconds', 'Time to run one oversight loop cycle', ['loop_name', 'status'])
LOOP_JUDGMENT_COUNT = Counter('oversight_loop_judgment_total', 'Judgment outcomes per loop', ['loop_name', 'judgment'])


class OversightLoop:
    """
    Base class encapsulating one oversight cycle (Micro, Meso, or Macro).
    It fetches metrics, evaluates them using an Evaluator, logs to a ledger,
    adjusts its criteria, and can trigger a subsequent loop.
    """

    def __init__(
        self,
        name: str,
        connector: Connector,
        metrics_to_fetch: List[str],   # NEW: list of metrics this loop will fetch
        evaluator: Evaluator,          # NEW: injected Evaluator instance
        criteria: Dict[str, float],    # Example: {"metric_name_1": 0.9, "metric_name_2": 0.5}
        next_loop: Optional["OversightLoop"],
        ledger: OversightLedger
    ):
        self.name = name
        self.connector = connector
        self.metrics_to_fetch = metrics_to_fetch  # Metrics this loop is responsible for
        self.evaluator = evaluator
        self.criteria = criteria                  # Thresholds for multiple metrics
        self.next_loop = next_loop
        self.ledger = ledger
        logger.info("OversightLoop initialized", loop_name=self.name, criteria=self.criteria)

    async def run_cycle(self, key: Union[int, str], run_id: str) -> None:
        """
        Executes one full cycle of the oversight loop for a given organizational key.
        Includes structured logging with correlation IDs.
        """
        # Bind correlation IDs to the logger for this specific run
        bound_logger = logger.bind(org_key=key, loop_name=self.name, run_id=run_id)
        bound_logger.info("Starting oversight cycle")

        overall_evaluation: Optional[OverallEvaluation] = None
        error_message: Optional[str] = None
        loop_status = "failure"

        with LOOP_RUN_TIME.labels(loop_name=self.name, status='success').time():  # Time the cycle
            try:
                # 1) Fetch metrics using the injected connector. The prototype
                #    connectors return a single metric per call, so each loop
                #    focuses on one primary metric from metrics_to_fetch and
                #    `criteria` supplies its threshold.
                metric_values: Dict[str, Optional[float]] = {}
                for metric_name in self.metrics_to_fetch:
                    metric_values[metric_name] = await self.connector.fetch(key)

                # 2) Evaluate against the current criteria (signature as in evaluator.py above).
                overall_evaluation = self.evaluator.evaluate(metric_values, self.criteria)
                LOOP_JUDGMENT_COUNT.labels(loop_name=self.name,
                                           judgment=overall_evaluation.judgment.name).inc()
                bound_logger.info("Evaluation complete",
                                  total_score=overall_evaluation.total_score,
                                  judgment=overall_evaluation.judgment.name,
                                  individual_metrics={k: v.meets_threshold for k, v in
                                                      overall_evaluation.individual_metrics.items()})
                loop_status = "success"
            except Exception as e:
                bound_logger.error("Error during loop execution", error=str(e), exc_info=True)
                loop_status = "failure"
                error_message = str(e)
                # Decide on fallback behavior here. For now, we log and proceed to record the error.
            finally:
                # 3) Log to ledger
                entry = {
                    "loop": self.name,
                    "key": key,
                    "run_id": run_id,  # Include correlation ID
                    "overall_evaluation": overall_evaluation.total_score if overall_evaluation else None,
                    "judgment": overall_evaluation.judgment.name if overall_evaluation else "ERROR",
                    "individual_metrics_eval": {
                        name: {
                            "value": eval_res.value,
                            "threshold": eval_res.threshold,
                            "meets_threshold": eval_res.meets_threshold
                        } for name, eval_res in overall_evaluation.individual_metrics.items()
                    } if overall_evaluation else {},
                    "loop_status": loop_status,
                    "error_message": error_message,
                    # ISO 8601 UTC with millisecond precision
                    "timestamp": datetime.datetime.now(datetime.timezone.utc)
                        .isoformat(timespec='milliseconds').replace('+00:00', 'Z'),
                }
                self.ledger.record(entry)
                bound_logger.debug("Ledger entry recorded.")

        # 4) Adjust criteria for the primary metric based on the overall judgment.
        if loop_status == "success" and overall_evaluation is not None:
            primary_metric_name = self.metrics_to_fetch[0] if self.metrics_to_fetch else None
            if primary_metric_name:
                # NOTE: the adjustment rule is assumed; the excerpt only shows the
                # rounding and assignment. Tighten slightly on PASS, relax otherwise.
                current_thresh = self.criteria[primary_metric_name]
                factor = 1.01 if overall_evaluation.judgment == JudgmentLevel.PASS else 0.99
                new_thresh = current_thresh * factor
                self.criteria[primary_metric_name] = round(new_thresh, 4)
                bound_logger.info("Criteria adjusted", new_threshold=self.criteria[primary_metric_name])
            else:
                bound_logger.warning("No primary metric to adjust criteria for.")
        else:
            bound_logger.warning("Criteria not adjusted due to error or no evaluation.")
        # 5) Trigger the next loop (sketched below).
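The class docstring says a loop can trigger a subsequent loop, and the DAG below relies on Micro triggering Meso and Meso triggering Macro, but that hand-off is not shown in the excerpt. A minimal sketch of step 5 at the end of run_cycle; an escalation policy keyed on the judgment could replace the unconditional call:

        # 5) Trigger the subsequent loop in the chain, if one is configured.
        if self.next_loop is not None:
            bound_logger.info("Triggering next loop", next_loop=self.next_loop.name)
            await self.next_loop.run_cycle(key, run_id)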
# dags/recursive_oversight_dag.py
import asyncio
import logging

from airflow.decorators import task

# Import paths follow the package layout described in the summary below.
from oversight.connectors.audit import AuditConnector
from oversight.connectors.public_trust import PublicTrustConnector
from oversight.connectors.derived import AlignmentConnector
from oversight.loops.base import OversightLoop

# Set up basic logging for the DAG, to see output in Airflow logs.
# In a production Airflow environment, this is typically managed by Airflow's logging config.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Instantiate connectors
audit_connector = AuditConnector()
public_trust_connector = PublicTrustConnector()
alignment_connector = AlignmentConnector()

# shared_evaluator, shared_ledger, and macro_loop are referenced below; their
# definitions are omitted from this excerpt.

meso_loop = OversightLoop(
    name="MesoLoop",
    connector=public_trust_connector,  # PublicTrustConnector fetches public_trust_index
    metrics_to_fetch=["public_trust_index"],
    evaluator=shared_evaluator,
    criteria={"public_trust_index": 70.0},  # Initial threshold for this metric
    next_loop=macro_loop,  # Meso triggers Macro
    ledger=shared_ledger
)

micro_loop = OversightLoop(
    name="MicroLoop",
    connector=audit_connector,  # AuditConnector fetches audit_timeliness
    metrics_to_fetch=["audit_timeliness"],
    evaluator=shared_evaluator,
    criteria={"audit_timeliness": 0.9},  # Initial threshold for this metric
    next_loop=meso_loop,  # Micro triggers Meso
    ledger=shared_ledger
)


@task
def list_organizations() -> list[int]:
    """
    Simulates fetching a dynamic list of organization IDs from a source.
    In production, this would query a database (Postgres/Snowflake) or an API.
    """
    logger.info("Fetching list of organizations for oversight.")
    # Example: In a real scenario, this would be a DB query:
    # from sqlalchemy import create_engine, text
    # engine = create_engine(DATABASE_URL)
    # with engine.connect() as connection:
    #     result = connection.execute(text("SELECT id FROM organizations;")).fetchall()
    #     org_ids = [row[0] for row in result]
    # return org_ids
    return [1, 2, 3]  # Placeholder IDs for the prototype


@task
def run_full_oversight_cycle_for_org(org_id: int, dag_run_id: str):
    """
    Entry point for Airflow. Runs the full recursive oversight cycle for a single organization.
    Uses asyncio.run() to execute the async loop chain.
    """
    # Bind correlation IDs to the logger for this specific task instance.
    # This ensures logs from this task are correlated with the DAG run and organization.
    import structlog
    structlog.configure(
        # Minimal processor chain for JSON output; adjust to your logging setup.
        processors=[
            structlog.stdlib.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer(),
        ],
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )
    task_logger = structlog.get_logger(__name__).bind(org_id=org_id, dag_run_id=dag_run_id)
    task_logger.info("Starting full oversight cycle for organization")
    try:
        # The micro_loop will trigger meso_loop, which will trigger macro_loop.
        asyncio.run(micro_loop.run_cycle(org_id, dag_run_id))
        task_logger.info("Completed full oversight cycle for organization")
    except Exception as e:
        task_logger.error("Failed to complete oversight cycle for organization",
                          error=str(e), exc_info=True)
        raise  # Re-raise to let Airflow mark the task as failed
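The DAG object and the dynamic task mapping wiring described in the summary below are not shown in the excerpt. A minimal sketch, assuming Airflow 2.x TaskFlow; the schedule and start date are illustrative:

import pendulum
from airflow.decorators import dag


@dag(
    dag_id="recursive_oversight_cycle_v2",
    schedule="@daily",                                   # illustrative
    start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),  # illustrative
    catchup=False,
    tags=["oversight"],
)
def recursive_oversight_cycle_v2():
    org_list = list_organizations()
    # One mapped task instance per organization; dag_run.run_id is passed as a correlation ID.
    run_full_oversight_cycle_for_org.partial(
        dag_run_id="{{ dag_run.run_id }}"
    ).expand(org_id=org_list)


recursive_oversight_cycle_v2()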
Summary of Changes:
1. oversight/evaluator.py (NEW):
○ Introduces JudgmentLevel Enum (FAIL, WARNING, PASS) for gradient judgments.
○ Defines MetricEvaluation and OverallEvaluation dataclasses for structured results.
○ Implements the Evaluator class, which takes metric_weights and judgment_ranges.
○ The evaluate method calculates a weighted total score from multiple metrics and
maps it to a JudgmentLevel.
2. oversight/ledger.py (UPDATED):
○ The OversightLedger now uses boto3 to write each ledger entry as a separate
JSON file to an S3 bucket.
○ Entries are partitioned by year/month/day/loop_name/org_key/entry_id.json for
efficient querying via external tools.
○ Timestamps are strictly ISO 8601 UTC with millisecond precision.
○ A unique entry_id (UUID) is added to each record.
3. oversight/connectors/*.py (UPDATED):
○ audit.py, public_trust.py, and derived.py now include Prometheus Summary (for
timing) and Counter (for success/failure) metrics.
○ These metrics are labeled with relevant context (e.g., org_id, org_name, status).
4. oversight/loops/base.py (UPDATED):
○ The OversightLoop now takes an Evaluator instance, metrics_to_fetch (list of metric
names), and criteria (dictionary of thresholds for multiple metrics).
○ It uses the injected Evaluator to determine the OverallEvaluation and
JudgmentLevel.
○ Prometheus Summary and Counter metrics are added for loop execution time,
count, and judgment outcomes.
○ Structured Logging: structlog is configured to emit JSON logs, and org_key and
run_id are bound to the logger context for end-to-end traceability.
○ The adjust logic is simplified to adjust the primary metric's threshold based on the
overall judgment.
5. dags/recursive_oversight_dag.py (UPDATED):
○ The DAG ID is updated to recursive_oversight_cycle_v2.
○ A new @task list_organizations is introduced to simulate fetching a dynamic list of
organizations.
○ The run_full_oversight_cycle_for_org is now a @task decorated function.
○ Dynamic Task Mapping: The run_full_oversight_cycle_for_org task
uses .partial(dag_run_id="{{ dag_run.run_id }}").expand(org_id=org_list_task) to
dynamically create a separate task instance for each organization returned by
list_organizations. This ensures scalability and granular visibility in Airflow.
○ The dag_run.run_id is passed as a correlation ID to the individual task runs.
This implementation significantly advances the Recursive Oversight platform towards a
production-ready state, embodying the principles of resilient, ethical, and observable systems.
Next Steps:
1. Deploy and Configure:
○ Ensure your Airflow environment has the recursive-oversight package installed (pip install . from the root).
○ Set up the necessary environment variables (from oversight/config.py) in your
Airflow deployment (e.g., DATABASE_URL, AWS credentials, S3 bucket names,
Lambda function names).
○ Ensure your AWS credentials are configured for boto3 (e.g., via IAM roles for
Airflow workers).
○ Deploy the updated DAG file (dags/recursive_oversight_dag.py) to your Airflow
DAGs folder.
2. Monitor and Validate:
○ Trigger the recursive_oversight_cycle_v2 DAG in Airflow.
○ Observe the Airflow UI for dynamically mapped tasks.
○ Check Airflow task logs for structured JSON output and correlation IDs.
○ Verify that JSON files are being written to your specified S3 ledger bucket, correctly
partitioned.
○ If you have a Prometheus setup, ensure the new metrics are being scraped and are
visible.
3. Implement Mock Services (for testing): For public_trust_connector and
alignment_connector, you'll need either actual deployed (even dummy) external
APIs/Lambda functions or robust mocks (e.g., using moto for AWS services) to allow the
connectors to run successfully during testing.
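Beyond moto, the Lambda-backed connector can be exercised with a plain stub of its boto3 client. A minimal pytest sketch under the reconstructed AlignmentConnector above; the canned score value is illustrative:

# tests/test_alignment_connector.py (illustrative sketch)
import asyncio
import io
import json
from unittest.mock import MagicMock

from oversight.connectors.derived import AlignmentConnector


def test_fetch_returns_score_from_stubbed_lambda():
    connector = AlignmentConnector()
    # Replace the real boto3 Lambda client with a stub returning a canned payload.
    connector.lambda_client = MagicMock()
    connector.lambda_client.invoke.return_value = {
        "StatusCode": 200,
        "Payload": io.BytesIO(json.dumps({"org_id": 7, "score": 0.82}).encode("utf-8")),
    }

    score = asyncio.run(connector.fetch(7))
    assert score == 0.82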
We've wired justice. Let's see it flow.