Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions configs/config.default.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ Watchman:
Download:
RefreshInterval: "12h"
InitialDataDirectory: ""
IncludedLists:
- "us_csl"
- "us_ofac"

# Specify which lists to download and include in Watchman results
# Examples: us_csl, us_ofac, us_non_sdn
IncludedLists: []

Search:
# Tune these settings based on your available resources (CPUs, etc).
Expand Down
76 changes: 44 additions & 32 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ Watchman:
Download:
RefreshInterval: "12h"
InitialDataDirectory: ""

# Specify which lists to download and include in Watchman results
# Examples: us_csl, us_ofac, us_non_sdn
IncludedLists:
- "us_csl"
- "us_ofac"
Expand Down Expand Up @@ -71,46 +74,55 @@ PostalPool is an experiment for improving address parsing. It's optional configu
CGOSelfInstances: 1
```

### Included Lists

| List ID | Name | Source |
|--------------|--------------------------------------------|----------------------------------------------------------|
| `us_ofac` | US Office of Foreign Assets Control (OFAC) | [URL](https://ofac.treasury.gov/sanctions-list-service) |
| `us_non_sdn` | US Office of Foreign Assets Control (OFAC) | [URL](https://ofac.treasury.gov/sanctions-list-service) |
| `us_csl` | Consolidated Screening List (CSL) | [URL](https://www.trade.gov/consolidated-screening-list) |

### Environment Variables

| Environmental Variable | Description | Default |
|-----|-----|-----|
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `HTTPS_CERT_FILE` | Filepath containing a certificate (or intermediate chain) to be served by the HTTP server. Requires all traffic be over secure HTTP. | Empty |
| `HTTPS_KEY_FILE` | Filepath of a private key matching the leaf certificate from `HTTPS_CERT_FILE`. | Empty |
| `LOG_FORMAT` | Format for logging lines to be written as. | Options: `json`, `plain` - Default: `plain` |
| `LOG_LEVEL` | Level of logging to emit. | Options: `trace`, `info` - Default: `info` |
| Environmental Variable | Description | Default |
|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------|
| `INCLUDED_LISTS` | Comma separated list of lists to include. | Empty |
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `HTTPS_CERT_FILE` | Filepath containing a certificate (or intermediate chain) to be served by the HTTP server. Requires all traffic be over secure HTTP. | Empty |
| `HTTPS_KEY_FILE` | Filepath of a private key matching the leaf certificate from `HTTPS_CERT_FILE`. | Empty |
| `LOG_FORMAT` | Format for logging lines to be written as. | Options: `json`, `plain` - Default: `plain` |
| `LOG_LEVEL` | Level of logging to emit. | Options: `trace`, `info` - Default: `info` |

### Similarity Configuration

| Environmental Variable | Description | Default |
|-----|-----|-----|
| `SEARCH_GOROUTINE_COUNT` | Set a fixed number of goroutines used for each search. Default is to dynamically optimize for faster results. | Empty |
| `KEEP_STOPWORDS` | Boolean to keep stopwords in names. | `false` |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before their score is penalised. | 0.9 |
| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 |
| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 |
| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15 |
| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for highest max similarity score. | 3 |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `DISABLE_PHONETIC_FILTERING` | Force scoring search terms against every indexed record. | `false` |
| Environmental Variable | Description | Default |
|------------------------------------|---------------------------------------------------------------------------------------------------------------|---------|
| `SEARCH_GOROUTINE_COUNT` | Set a fixed number of goroutines used for each search. Default is to dynamically optimize for faster results. | Empty |
| `KEEP_STOPWORDS` | Boolean to keep stopwords in names. | `false` |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
| `LENGTH_DIFFERENCE_CUTOFF_FACTOR`  | Minimum ratio for the length of two matching tokens, before their score is penalised.                          | 0.9     |
| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 |
| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 |
| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15 |
| `ADJACENT_SIMILARITY_POSITIONS`    | How many nearby words to search for highest max similarity score.                                              | 3       |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `DISABLE_PHONETIC_FILTERING` | Force scoring search terms against every indexed record. | `false` |

#### Source List Configuration

| Environmental Variable | Description | Default |
|-----|-----|-----|
| `DOWNLOAD_TIMEOUT` | Duration of time allowed for a list to fully download. | `45s` |
| `OFAC_DOWNLOAD_TEMPLATE` | HTTP address for downloading raw OFAC files. | `https://sanctionslistservice.ofac.treas.gov/api/PublicationPreview/exports/%s` |
| `EU_CSL_TOKEN` | Token used to download the EU Consolidated Screening List | `<valid-token>` |
| `EU_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading EU Consolidated Screening List | Subresource of `webgate.ec.europa.eu` |
| `UK_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading UK Consolidated Screening List | Subresource of `www.gov.uk` |
| `UK_SANCTIONS_LIST_URL` | Use an alternate URL for downloading UK Sanctions List | Subresource of `www.gov.uk` |
| `WITH_UK_SANCTIONS_LIST` | Download and parse the UK Sanctions List on startup. | Default: `false` |
| `US_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading US Consolidated Screening List | Subresource of `api.trade.gov` |
| `CSL_DOWNLOAD_TEMPLATE` | Same as `US_CSL_DOWNLOAD_URL` | |
| Environmental Variable | Description | Default |
|--------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------------------|
| `DOWNLOAD_TIMEOUT` | Duration of time allowed for a list to fully download. | `45s` |
| `OFAC_DOWNLOAD_TEMPLATE` | HTTP address for downloading raw OFAC files. | `https://sanctionslistservice.ofac.treas.gov/api/PublicationPreview/exports/%s` |
| `EU_CSL_TOKEN` | Token used to download the EU Consolidated Screening List | `<valid-token>` |
| `EU_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading EU Consolidated Screening List | Subresource of `webgate.ec.europa.eu` |
| `UK_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading UK Consolidated Screening List | Subresource of `www.gov.uk` |
| `UK_SANCTIONS_LIST_URL` | Use an alternate URL for downloading UK Sanctions List | Subresource of `www.gov.uk` |
| `WITH_UK_SANCTIONS_LIST` | Download and parse the UK Sanctions List on startup. | Default: `false` |
| `US_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading US Consolidated Screening List | Subresource of `api.trade.gov` |
| `CSL_DOWNLOAD_TEMPLATE` | Same as `US_CSL_DOWNLOAD_URL` | |

## Data persistence

Expand Down
18 changes: 9 additions & 9 deletions docs/intro.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,24 @@ Watchman is a robust compliance screening tool that provides:
- Native Go library for direct implementation
- Webhook notifications for automated workflows

## Included Lists
## Included List Sources

Watchman integrates the following lists to help you maintain global compliance:

| Source | List |
|--------|------|
| US Treasury | [Office of Foreign Assets Control (OFAC)](https://ofac.treasury.gov/sanctions-list-service) |
| US Government | [Consolidated Screening List (CSL)](https://www.trade.gov/consolidated-screening-list) |
| Source | List |
|---------------|---------------------------------------------------------------------------------------------------------|
| US Treasury | [Office of Foreign Assets Control (OFAC)](https://ofac.treasury.gov/sanctions-list-service) and Non-SDN |
| US Government | [Consolidated Screening List (CSL)](https://www.trade.gov/consolidated-screening-list) |

### Future Lists

The v0.50+ series of Watchman has revamped its search engine. The following lists are being re-added into Watchman.

| Source | List |
|--------|------|
| Source | List |
|----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| European Union | [Consolidated Sanctions List](https://data.europa.eu/data/datasets/consolidated-list-of-persons-groups-and-entities-subject-to-eu-financial-sanctions?locale=en) |
| United Kingdom | [OFSI Sanctions List](https://www.gov.uk/government/publications/financial-sanctions-consolidated-list-of-targets/consolidated-list-of-targets#contents) |
| United Kingdom | [Sanctions List](https://www.gov.uk/government/publications/the-uk-sanctions-list) (Disabled by default) |
| United Kingdom | [OFSI Sanctions List](https://www.gov.uk/government/publications/financial-sanctions-consolidated-list-of-targets/consolidated-list-of-targets#contents) |
| United Kingdom | [Sanctions List](https://www.gov.uk/government/publications/the-uk-sanctions-list) (Disabled by default) |

## Search Methodology

Expand Down
48 changes: 37 additions & 11 deletions internal/download/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"os"
"path/filepath"
"slices"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -41,6 +42,8 @@ func (dl *downloader) RefreshAll(ctx context.Context) (Stats, error) {
ctx, span := telemetry.StartSpan(ctx, "refresh-all")
defer span.End()

start := time.Now()

stats := Stats{
Lists: make(map[string]int),
ListHashes: make(map[string]string),
Expand All @@ -49,8 +52,6 @@ func (dl *downloader) RefreshAll(ctx context.Context) (Stats, error) {
logger := dl.logger.Info().With(log.Fields{
"initial_data_directory": log.String(expandInitialDir(initialDataDirectory(dl.conf))),
})
start := time.Now()
logger.Info().Log("starting list refresh")

g, ctx := errgroup.WithContext(ctx)
preparedLists := make(chan preparedList, 10)
Expand All @@ -73,12 +74,18 @@ func (dl *downloader) RefreshAll(ctx context.Context) (Stats, error) {
// Create a WaitGroup to track all producers
var producerWg sync.WaitGroup

// Track what lists have been requested
var requestedLists []search.SourceList
// Track what lists have been requested and loaded
requestedLists := getIncludedLists(dl.conf.IncludedLists)
var listsLoaded []search.SourceList

if len(requestedLists) == 0 {
logger.Warn().Log("no lists have been configured!")
}
logger.Info().Logf("starting list refresh of %v", requestedLists)

// OFAC Records
if slices.Contains(dl.conf.IncludedLists, search.SourceUSOFAC) {
requestedLists = append(requestedLists, search.SourceUSOFAC)
if slices.Contains(requestedLists, search.SourceUSOFAC) {
listsLoaded = append(listsLoaded, search.SourceUSOFAC)

producerWg.Add(1)
g.Go(func() error {
Expand All @@ -92,8 +99,8 @@ func (dl *downloader) RefreshAll(ctx context.Context) (Stats, error) {
}

// CSL Records
if slices.Contains(dl.conf.IncludedLists, search.SourceUSCSL) {
requestedLists = append(requestedLists, search.SourceUSCSL)
if slices.Contains(requestedLists, search.SourceUSCSL) {
listsLoaded = append(listsLoaded, search.SourceUSCSL)

producerWg.Add(1)
g.Go(func() error {
Expand All @@ -108,12 +115,12 @@ func (dl *downloader) RefreshAll(ctx context.Context) (Stats, error) {

// Compare the configured lists against those we actually loaded.
// Any extra lists are an error as we don't want to silently ignore them.
if len(requestedLists) > len(dl.conf.IncludedLists) {
if len(listsLoaded) > len(requestedLists) {
close(preparedLists)

return stats, fmt.Errorf("loaded extra lists: %#v loaded compared to %#v configured", requestedLists, dl.conf.IncludedLists)
return stats, fmt.Errorf("loaded extra lists: %#v loaded compared to %#v configured", listsLoaded, requestedLists)
}
if extra := findExtraLists(dl.conf.IncludedLists, requestedLists); extra != "" {
if extra := findExtraLists(requestedLists, listsLoaded); extra != "" {
close(preparedLists)

return stats, fmt.Errorf("unknown lists: %v", extra)
Expand Down Expand Up @@ -141,6 +148,25 @@ func (dl *downloader) RefreshAll(ctx context.Context) (Stats, error) {
return stats, nil
}

// getIncludedLists merges the lists configured in the config file with any
// lists named in the INCLUDED_LISTS environment variable (comma separated,
// case-insensitive, surrounding whitespace ignored). The result is sorted
// and deduplicated; the caller's slice is never mutated.
func getIncludedLists(configured []search.SourceList) []search.SourceList {
	// Copy the configured lists so appends below can't touch the caller's slice.
	lists := append([]search.SourceList(nil), configured...)

	// Fold in entries from the environment, normalizing each to lowercase.
	if env := strings.TrimSpace(os.Getenv("INCLUDED_LISTS")); env != "" {
		for _, raw := range strings.Split(env, ",") {
			if name := strings.ToLower(strings.TrimSpace(raw)); name != "" {
				lists = append(lists, search.SourceList(name))
			}
		}
	}

	// Sort so duplicates are adjacent, then collapse them.
	slices.Sort(lists)

	return slices.Compact(lists)
}

func findExtraLists(config, loaded []search.SourceList) string {
var extra []search.SourceList

Expand Down
Loading