2 changes: 1 addition & 1 deletion cmd/filter.go

@@ -21,7 +21,7 @@ func filterFunction(c *cli.Context) error {
 	batcher := helpers.BuildBatcherFromArguments(c)
 	extractor := helpers.BuildExtractorFromArgumentsEx(c, batcher, "\t")
 
-	readChan := extractor.ReadChan()
+	readChan := extractor.ReadFull()
 OUTER_LOOP:
 	for {
 		matchBatch, more := <-readChan
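For context on this rename: filter prints complete match details, so it keeps the slower full-detail path. A minimal consumer sketch, assuming an extractor built by the helpers above (the Printf output is illustrative, not code from this PR):

readChan := extractor.ReadFull() // yields batches of []Match
OUTER_LOOP:
for {
	matchBatch, more := <-readChan
	if !more {
		break OUTER_LOOP // channel closed once all workers finish
	}
	for _, match := range matchBatch {
		fmt.Printf("%s:%d: %s\n", match.Source, match.LineNumber, match.Extracted)
	}
}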
10 changes: 6 additions & 4 deletions cmd/helpers/updatingAggregator.go

@@ -12,8 +12,10 @@ import (
 
 // RunAggregationLoop is a helper that takes care of output sync
 // And the main async loops for you, it has two inputs (in addition to the extractor)
-// matchProcessor - to process a match
-// writeOutput - triggered after a delay, only if there's an update
+//
+// matchProcessor - to process a match
+// writeOutput - triggered after a delay, only if there's an update
+//
 // The two functions are guaranteed to never happen at the same time
 func RunAggregationLoop(ext *extractor.Extractor, aggregator aggregation.Aggregator, writeOutput func()) {
 	logger.DeferLogs()
@@ -39,7 +41,7 @@ func RunAggregationLoop(ext *extractor.Extractor, aggregator aggregation.Aggregator, writeOutput func()) {
 	// Processing data from extractor
 	exitSignal := make(chan os.Signal, 1)
 	signal.Notify(exitSignal, os.Interrupt)
-	reader := ext.ReadChan()
+	reader := ext.ReadSimple()
 PROCESSING_LOOP:
 	for {
 		select {
@@ -51,7 +53,7 @@ PROCESSING_LOOP:
 			}
 			outputMutex.Lock()
 			for _, match := range matchBatch {
-				aggregator.Sample(match.Extracted)
+				aggregator.Sample(match)
 			}
 			outputMutex.Unlock()
 		}
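The doc-comment guarantee above (sampling and writeOutput never running at the same time) is enforced by a shared mutex; this change only swaps the reader to the cheaper ReadSimple path and drops the .Extracted dereference, since the channel now carries the extracted strings directly. A minimal sketch of the synchronization contract, where the ticker interval and dirty flag are illustrative assumptions rather than code from this PR:

var outputMutex sync.Mutex
dirty := false

// Output side: flush on a delay, and only if something changed
go func() {
	for range time.Tick(100 * time.Millisecond) {
		outputMutex.Lock()
		if dirty {
			writeOutput() // never overlaps with Sample below
			dirty = false
		}
		outputMutex.Unlock()
	}
}()

// Processing side: sample every extracted key under the same lock
for matchBatch := range ext.ReadSimple() {
	outputMutex.Lock()
	for _, match := range matchBatch {
		aggregator.Sample(match)
		dirty = true
	}
	outputMutex.Unlock()
}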
273 changes: 175 additions & 98 deletions pkg/extractor/extractor.go

@@ -9,6 +9,15 @@ import (
 	"unsafe"
 )
 
+/*
+Extractor is the main working pool that runs all CPU-intensive tasks (matcher and expressions)
+Because of this, this area is extra-optimized, and so looks a little funny
+
+Most aggregators will use ReadSimple(), which only returns the matches, and is significantly
+faster than its ReadFull counterpart. Because it's so sensitive, the code path is provided twice
+rather than made generic
+*/
+
 // BString a []byte representation of a string (used for performance over string-copies)
 type BString []byte
 
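In practice the comment block above boils down to which payload a consumer needs. A hedged sketch of the two call sites (counts and show are hypothetical stand-ins; a given extractor should use one path or the other, since both worker types drain the same input channel):

// Fast path: aggregators that only need the extracted key strings
for batch := range ext.ReadSimple() { // <-chan []string
	for _, key := range batch {
		counts[key]++ // counts: an illustrative map[string]uint64
	}
}

// Full path: consumers that need the raw line, indices, source, and line number
for batch := range ext.ReadFull() { // <-chan []Match
	for _, m := range batch {
		show(m.Source, m.LineNumber, m.Line) // show() is hypothetical
	}
}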
@@ -37,24 +46,26 @@ type Config struct {
 	Ignore  IgnoreSet // Ignore these truthy expressions
 }
 
+func (s *Config) getWorkerCount() int {
+	if s.Workers <= 0 {
+		return 2
+	}
+	return s.Workers
+}
+
 // Extractor is the representation of the reader
-//
-// Expects someone to consume its ReadChan()
 type Extractor struct {
-	readChan       chan []Match
-	matcherFactory matchers.Factory
-	readLines      uint64
-	matchedLines   uint64
-	ignoredLines   uint64
-	config         Config
-	keyBuilder     *expressions.CompiledKeyBuilder
-	ignore         IgnoreSet
-}
-
-type extractorInstance struct {
-	*Extractor
-	matcher matchers.Matcher
-	context *SliceSpaceExpressionContext
+	config     *Config
+	keyBuilder *expressions.CompiledKeyBuilder
+	ignore     IgnoreSet
+
+	readLines    uint64
+	matchedLines uint64
+	ignoredLines uint64
+
+	input <-chan InputBatch
 }
 
 func (s *Extractor) ReadLines() uint64 {
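With the constructor split out (see the next hunk), New only compiles the key-builder and captures the config; no goroutines start until a Read call. A construction sketch, where the field values and the batcher channel are illustrative assumptions:

ext, err := extractor.New(inputBatches, &extractor.Config{
	Matcher: matcherFactory, // matcher factory assumed to be built elsewhere
	Extract: "{1} {2}",      // illustrative key-builder expression
	Workers: 0,              // <= 0 falls back to getWorkerCount()'s default of 2
})
if err != nil {
	return err
}
keys := ext.ReadSimple() // worker goroutines spin up here, not in New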
@@ -69,119 +80,185 @@ func (s *Extractor) IgnoredLines() uint64 {
 	return atomic.LoadUint64(&s.ignoredLines)
 }
 
-func (s *Extractor) ReadChan() <-chan []Match {
-	return s.readChan
-}
+func New(inputBatch <-chan InputBatch, config *Config) (*Extractor, error) {
+	kb, err := funclib.NewKeyBuilder().Compile(config.Extract)
+	if err != nil {
+		return nil, err
+	}
+
+	ext := &Extractor{
+		config:     config,
+		keyBuilder: kb,
+		ignore:     config.Ignore,
+		input:      inputBatch,
+	}
+
+	return ext, nil
+}
 
-// async safe
-func (s *extractorInstance) processLineSync(source string, lineNum uint64, line BString) (Match, bool) {
-	atomic.AddUint64(&s.readLines, 1)
-	matches := s.matcher.FindSubmatchIndex(line)
-
-	// Extract and forward to the ReadChan if there are matches
-	if len(matches) > 0 {
-		// Speed is more important here than safety
-		// By default, casting to string will copy() data from bytes to
-		// a string instance, but we can safely point to the existing bytes
-		// as a pointer instead
-		lineStringPtr := *(*string)(unsafe.Pointer(&line))
-		// A context is created for each "instance", and since a context isn't shared beyond building a key
-		// it's significantly faster to reuse a single context per goroutine
-		expContext := s.context
-		expContext.linePtr = lineStringPtr
-		expContext.indices = matches
-		expContext.source = source
-		expContext.lineNum = lineNum
-		if s.ignore == nil || !s.ignore.IgnoreMatch(expContext) {
-			extractedKey := s.keyBuilder.BuildKey(expContext)
-
-			if len(extractedKey) > 0 {
-				atomic.AddUint64(&s.matchedLines, 1)
-				return Match{
-					bLine:      line, // Need to keep around what lineStringPtr is pointing to
-					Line:       lineStringPtr,
-					Indices:    matches,
-					Extracted:  extractedKey,
-					LineNumber: lineNum,
-					Source:     source,
-				}, true
-			}
-			atomic.AddUint64(&s.ignoredLines, 1)
-		} else {
-			atomic.AddUint64(&s.ignoredLines, 1)
-		}
-	}
-	return Match{}, false
-}
+func (s *Extractor) workerFull(output chan<- []Match) {
+	matcher := s.config.Matcher.CreateInstance()
+	exprCtx := &SliceSpaceExpressionContext{
+		nameTable: matcher.SubexpNameTable(),
+	}
+
+	for batch := range s.input {
+		var (
+			matchBatch   []Match
+			ignoredCount int = 0
+		)
+
+		// setup
+		atomic.AddUint64(&s.readLines, uint64(len(batch.Batch)))
+		exprCtx.source = batch.Source
+
+		// Process each line
+		for idx, line := range batch.Batch {
+			matches := matcher.FindSubmatchIndex(line)
+
+			if len(matches) > 0 {
+				// Speed is more important here than safety
+				// By default, casting to string will copy() data from bytes to
+				// a string instance, but we can safely point to the existing bytes
+				// as a pointer instead
+				lineStringPtr := *(*string)(unsafe.Pointer(&line))
+
+				// A context is created for each "instance", and since a context isn't shared beyond building a key
+				// it's significantly faster to reuse a single context per goroutine
+				exprCtx.linePtr = lineStringPtr
+				exprCtx.indices = matches
+				exprCtx.lineNum = batch.BatchStart + uint64(idx)
+
+				// Check ignore, if possible
+				if s.ignore == nil || !s.ignore.IgnoreMatch(exprCtx) {
+					extractedKey := s.keyBuilder.BuildKey(exprCtx)
+
+					// Extracted a key
+					if len(extractedKey) > 0 {
+						if matchBatch == nil {
+							matchBatch = make([]Match, 0, len(batch.Batch))
+						}
+
+						matchBatch = append(matchBatch, Match{
+							bLine:      line, // Need to keep around what lineStringPtr is pointing to
+							Line:       lineStringPtr,
+							Indices:    matches,
+							Extracted:  extractedKey,
+							LineNumber: exprCtx.lineNum,
+							Source:     batch.Source,
+						})
+					} else {
+						ignoredCount++
+					}
+
+				} else {
+					ignoredCount++
+				}
+			}
+		}
+
+		if ignoredCount > 0 {
+			atomic.AddUint64(&s.ignoredLines, uint64(ignoredCount))
+		}
+
+		// Emit batch if there is data
+		if len(matchBatch) > 0 {
+			atomic.AddUint64(&s.matchedLines, uint64(len(matchBatch)))
+			output <- matchBatch
+		}
+	}
+}
 
-func (s *Extractor) asyncWorker(wg *sync.WaitGroup, inputBatch <-chan InputBatch) {
-	defer wg.Done()
-
-	matcher := s.matcherFactory.CreateInstance()
-	si := extractorInstance{
-		Extractor: s,
-		matcher:   matcher,
-		context: &SliceSpaceExpressionContext{
-			nameTable: matcher.SubexpNameTable(),
-		},
-	}
-
-	for {
-		batch, more := <-inputBatch
-		if !more {
-			break
-		}
-
-		var matchBatch []Match
-		for idx, str := range batch.Batch {
-			if match, ok := si.processLineSync(batch.Source, batch.BatchStart+uint64(idx), str); ok {
-				if matchBatch == nil {
-					// Initialize to expected cap (only if we have any matches)
-					matchBatch = make([]Match, 0, len(batch.Batch))
-				}
-				matchBatch = append(matchBatch, match)
-			}
-		}
-
-		if len(matchBatch) > 0 {
-			s.readChan <- matchBatch
-		}
-	}
-}
+// Read full match definitions, at the cost of lower performance (about 15% slower clock time / cpu time)
+func (s *Extractor) ReadFull() <-chan []Match {
+	return startWorkers(s.config.getWorkerCount(), s.workerFull)
+}
+
+func (s *Extractor) workerSimple(output chan<- []string) {
+	matcher := s.config.Matcher.CreateInstance()
+	exprCtx := &SliceSpaceExpressionContext{
+		nameTable: matcher.SubexpNameTable(),
+	}
+
+	for batch := range s.input {
+		var (
+			matchBatch   []string
+			ignoredCount int = 0
+		)
+
+		// setup
+		atomic.AddUint64(&s.readLines, uint64(len(batch.Batch)))
+		exprCtx.source = batch.Source
+
+		// Process each line
+		for idx, line := range batch.Batch {
+			matches := matcher.FindSubmatchIndex(line)
+
+			if len(matches) > 0 {
+				// Speed is more important here than safety
+				// By default, casting to string will copy() data from bytes to
+				// a string instance, but we can safely point to the existing bytes
+				// as a pointer instead
+				lineStringPtr := *(*string)(unsafe.Pointer(&line))
+
+				// A context is created for each "instance", and since a context isn't shared beyond building a key
+				// it's significantly faster to reuse a single context per goroutine
+				exprCtx.linePtr = lineStringPtr
+				exprCtx.indices = matches
+				exprCtx.lineNum = batch.BatchStart + uint64(idx)
+
+				// Check ignore, if possible
+				if s.ignore == nil || !s.ignore.IgnoreMatch(exprCtx) {
+					extractedKey := s.keyBuilder.BuildKey(exprCtx)
+
+					// Extracted a key
+					if len(extractedKey) > 0 {
+						if matchBatch == nil {
+							matchBatch = make([]string, 0, len(batch.Batch))
+						}
+						matchBatch = append(matchBatch, extractedKey)
+					} else {
+						ignoredCount++
+					}
+
+				} else {
+					ignoredCount++
+				}
+			}
+		}
+
+		if ignoredCount > 0 {
+			atomic.AddUint64(&s.ignoredLines, uint64(ignoredCount))
+		}
+
+		// Emit batch if there is data
+		if len(matchBatch) > 0 {
+			atomic.AddUint64(&s.matchedLines, uint64(len(matchBatch)))
+			output <- matchBatch
+		}
+	}
+}
 
-// New an extractor from an input channel
-func New(inputBatch <-chan InputBatch, config *Config) (*Extractor, error) {
-	compiledExpression, compErr := funclib.NewKeyBuilder().Compile(config.Extract)
-	if compErr != nil {
-		return nil, compErr
-	}
-
-	extractor := Extractor{
-		readChan:       make(chan []Match, 5),
-		matcherFactory: config.Matcher,
-		keyBuilder:     compiledExpression,
-		config:         *config,
-		ignore:         config.Ignore,
-	}
-
-	var wg sync.WaitGroup
-	for i := 0; i < config.getWorkerCount(); i++ {
-		wg.Add(1)
-		go extractor.asyncWorker(&wg, inputBatch)
-	}
-
-	go func() {
-		wg.Wait()
-		close(extractor.readChan)
-	}()
-
-	return &extractor, nil
-}
-
-func (s *Config) getWorkerCount() int {
-	if s.Workers <= 0 {
-		return 2
-	}
-	return s.Workers
-}
+func (s *Extractor) ReadSimple() <-chan []string {
+	return startWorkers(s.config.getWorkerCount(), s.workerSimple)
+}
+
+func startWorkers[T string | Match](count int, worker func(output chan<- []T)) <-chan []T {
+	var wg sync.WaitGroup
+	output := make(chan []T, count*2)
+
+	for range count {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			worker(output)
+		}()
+	}
+
+	go func() {
+		wg.Wait()
+		close(output)
+	}()
+
+	return output
+}
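startWorkers is the piece that replaces the old hand-rolled worker loop in New: a generic fan-out/fan-in where N workers share one buffered output channel, and a final goroutine closes it once every worker returns, preserving the close behavior the old ReadChan consumers relied on. The same shape, reduced to a self-contained runnable example (the [T any] constraint and the squaring workload are illustrative; the diff constrains T to string | Match and uses Go 1.22's "for range count"):

package main

import (
	"fmt"
	"sync"
)

// startWorkers mirrors the pattern from this diff: spawn count workers that
// share one buffered output channel, then close it once all of them return.
func startWorkers[T any](count int, worker func(output chan<- T)) <-chan T {
	var wg sync.WaitGroup
	output := make(chan T, count*2) // small buffer so producers rarely block

	for i := 0; i < count; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			worker(output)
		}()
	}

	// Close only after every producer has finished, which makes
	// "for range" termination safe for the consumer.
	go func() {
		wg.Wait()
		close(output)
	}()

	return output
}

func main() {
	results := startWorkers(4, func(out chan<- int) {
		for i := 0; i < 3; i++ {
			out <- i * i
		}
	})
	for v := range results {
		fmt.Println(v)
	}
}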