From b91aff340a510a61562aa0cebaac70032ca5c47f Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 4 Jun 2019 14:57:01 +0300 Subject: [PATCH 1/8] Split MultiProcessCollector.__init__ for better profiling Signed-off-by: Aarni Koskela --- prometheus_client/multiprocess.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/prometheus_client/multiprocess.py b/prometheus_client/multiprocess.py index e34ced03..1fa003bb 100644 --- a/prometheus_client/multiprocess.py +++ b/prometheus_client/multiprocess.py @@ -33,6 +33,11 @@ def merge(files, accumulate=True): But if writing the merged data back to mmap files, use accumulate=False to avoid compound accumulation. """ + metrics = MultiProcessCollector._read_metrics(files) + return MultiProcessCollector._accumulate_metrics(metrics, accumulate) + + @staticmethod + def _read_metrics(files): metrics = {} for f in files: parts = os.path.basename(f).split('_') @@ -55,7 +60,10 @@ def merge(files, accumulate=True): # The duplicates and labels are fixed in the next for. 
metric.add_sample(name, labels_key, value) d.close() + return metrics + @staticmethod + def _accumulate_metrics(metrics, accumulate): for metric in metrics.values(): samples = defaultdict(float) buckets = {} From b325a5e5d67dd05b15c57174ae88fffcee0a3bbb Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 4 Jun 2019 15:46:07 +0300 Subject: [PATCH 2/8] fstat mmap file only once Signed-off-by: Aarni Koskela --- prometheus_client/mmap_dict.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/prometheus_client/mmap_dict.py b/prometheus_client/mmap_dict.py index 679597fa..500f0342 100644 --- a/prometheus_client/mmap_dict.py +++ b/prometheus_client/mmap_dict.py @@ -37,9 +37,11 @@ class MmapedDict(object): def __init__(self, filename, read_mode=False): self._f = open(filename, 'rb' if read_mode else 'a+b') self._fname = filename - if os.fstat(self._f.fileno()).st_size == 0: + capacity = os.fstat(self._f.fileno()).st_size + if capacity == 0: self._f.truncate(_INITIAL_MMAP_SIZE) - self._capacity = os.fstat(self._f.fileno()).st_size + capacity = _INITIAL_MMAP_SIZE + self._capacity = capacity self._m = mmap.mmap(self._f.fileno(), self._capacity, access=mmap.ACCESS_READ if read_mode else mmap.ACCESS_WRITE) From db2a7f0d5e428ba00d274672b182c68d8ba16064 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 4 Jun 2019 15:46:27 +0300 Subject: [PATCH 3/8] Avoid unpack_from() for a simple slice Signed-off-by: Aarni Koskela --- prometheus_client/mmap_dict.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/prometheus_client/mmap_dict.py b/prometheus_client/mmap_dict.py index 500f0342..3327adcd 100644 --- a/prometheus_client/mmap_dict.py +++ b/prometheus_client/mmap_dict.py @@ -81,7 +81,6 @@ def _read_all_values(self): # on every loop iteration used = self._used data = self._m - unpack_from = struct.unpack_from while pos < used: encoded_len = _unpack_integer(data, pos)[0] @@ -90,11 +89,11 @@ def _read_all_values(self): msg = 'Read 
beyond file size detected, %s is corrupted.' raise RuntimeError(msg % self._fname) pos += 4 - encoded = unpack_from(('%ss' % encoded_len).encode(), data, pos)[0] + encoded_key = data[pos : pos + encoded_len] padded_len = encoded_len + (8 - (encoded_len + 4) % 8) pos += padded_len value = _unpack_double(data, pos)[0] - yield encoded.decode('utf-8'), value, pos + yield encoded_key.decode('utf-8'), value, pos pos += 8 def read_all_values(self): From 9c35671e0f8a31d38af3029e2fa268577995ea7a Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 4 Jun 2019 15:16:15 +0300 Subject: [PATCH 4/8] Avoid duplicate JSON parsing and small allocations Signed-off-by: Aarni Koskela --- prometheus_client/multiprocess.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/prometheus_client/multiprocess.py b/prometheus_client/multiprocess.py index 1fa003bb..e62f9fcd 100644 --- a/prometheus_client/multiprocess.py +++ b/prometheus_client/multiprocess.py @@ -12,6 +12,8 @@ from .samples import Sample from .utils import floatToGoString +MP_METRIC_HELP = 'Multiprocess metric' + class MultiProcessCollector(object): """Collector for files for multi-process mode.""" @@ -39,17 +41,26 @@ def merge(files, accumulate=True): @staticmethod def _read_metrics(files): metrics = {} + key_cache = {} + + def _parse_key(key): + val = key_cache.get(key) + if not val: + metric_name, name, labels = json.loads(key) + labels_key = tuple(sorted(labels.items())) + val = key_cache[key] = (metric_name, name, labels, labels_key) + return val + for f in files: parts = os.path.basename(f).split('_') typ = parts[0] d = MmapedDict(f, read_mode=True) for key, value in d.read_all_values(): - metric_name, name, labels = json.loads(key) - labels_key = tuple(sorted(labels.items())) + metric_name, name, labels, labels_key = _parse_key(key) metric = metrics.get(metric_name) if metric is None: - metric = Metric(metric_name, 'Multiprocess metric', typ) + metric = Metric(metric_name, 
MP_METRIC_HELP, typ) metrics[metric_name] = metric if typ == 'gauge': From fbed8d1380d89b51ba4c526807fce74eab943407 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 4 Jun 2019 15:56:17 +0300 Subject: [PATCH 5/8] Don't use mmap() when only reading a MmapedDict file Signed-off-by: Aarni Koskela --- prometheus_client/mmap_dict.py | 53 ++++++++++++++++++------------- prometheus_client/multiprocess.py | 4 +-- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/prometheus_client/mmap_dict.py b/prometheus_client/mmap_dict.py index 3327adcd..d56923ee 100644 --- a/prometheus_client/mmap_dict.py +++ b/prometheus_client/mmap_dict.py @@ -22,6 +22,29 @@ def _pack_integer(data, pos, value): data[pos:pos + 4] = _pack_integer_func(value) +def _read_all_values(data, used=0): + """Yield (key, value, pos). No locking is performed.""" + + if used <= 0: + # If no valid `used` value is passed in, read it from the file. + used = _unpack_integer(data, 0)[0] + + pos = 8 + + while pos < used: + encoded_len = _unpack_integer(data, pos)[0] + # check we are not reading beyond bounds + if encoded_len + pos > used: + raise RuntimeError('Read beyond file size detected, file is corrupted.') + pos += 4 + encoded_key = data[pos : pos + encoded_len] + padded_len = encoded_len + (8 - (encoded_len + 4) % 8) + pos += padded_len + value = _unpack_double(data, pos)[0] + yield encoded_key.decode('utf-8'), value, pos + pos += 8 + + class MmapedDict(object): """A dict of doubles, backed by an mmapped file. @@ -55,6 +78,12 @@ def __init__(self, filename, read_mode=False): for key, _, pos in self._read_all_values(): self._positions[key] = pos + @staticmethod + def read_all_values_from_file(filename): + with open(filename, 'rb') as infp: + data = infp.read() + return _read_all_values(data) + def _init_value(self, key): """Initialize a value. 
Lock must be held by caller.""" encoded = key.encode('utf-8') @@ -74,30 +103,10 @@ def _init_value(self, key): def _read_all_values(self): """Yield (key, value, pos). No locking is performed.""" - - pos = 8 - - # cache variables to local ones and prevent attributes lookup - # on every loop iteration - used = self._used - data = self._m - - while pos < used: - encoded_len = _unpack_integer(data, pos)[0] - # check we are not reading beyond bounds - if encoded_len + pos > used: - msg = 'Read beyond file size detected, %s is corrupted.' - raise RuntimeError(msg % self._fname) - pos += 4 - encoded_key = data[pos : pos + encoded_len] - padded_len = encoded_len + (8 - (encoded_len + 4) % 8) - pos += padded_len - value = _unpack_double(data, pos)[0] - yield encoded_key.decode('utf-8'), value, pos - pos += 8 + return _read_all_values(data=self._m, used=self._used) def read_all_values(self): - """Yield (key, value, pos). No locking is performed.""" + """Yield (key, value). No locking is performed.""" for k, v, _ in self._read_all_values(): yield k, v diff --git a/prometheus_client/multiprocess.py b/prometheus_client/multiprocess.py index e62f9fcd..7676f251 100644 --- a/prometheus_client/multiprocess.py +++ b/prometheus_client/multiprocess.py @@ -54,8 +54,7 @@ def _parse_key(key): for f in files: parts = os.path.basename(f).split('_') typ = parts[0] - d = MmapedDict(f, read_mode=True) - for key, value in d.read_all_values(): + for key, value, pos in MmapedDict.read_all_values_from_file(f): metric_name, name, labels, labels_key = _parse_key(key) metric = metrics.get(metric_name) @@ -70,7 +69,6 @@ def _parse_key(key): else: # The duplicates and labels are fixed in the next for. 
metric.add_sample(name, labels_key, value) - d.close() return metrics @staticmethod From 2a56d5c6f4425d5afa345681b4afc972c4517cac Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 4 Jun 2019 16:01:07 +0300 Subject: [PATCH 6/8] Construct less tuples and dicts in accumulate Signed-off-by: Aarni Koskela --- prometheus_client/multiprocess.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/prometheus_client/multiprocess.py b/prometheus_client/multiprocess.py index 7676f251..9d351231 100644 --- a/prometheus_client/multiprocess.py +++ b/prometheus_client/multiprocess.py @@ -75,21 +75,22 @@ def _parse_key(key): def _accumulate_metrics(metrics, accumulate): for metric in metrics.values(): samples = defaultdict(float) - buckets = {} + buckets = defaultdict(lambda: defaultdict(float)) + samples_setdefault = samples.setdefault for s in metric.samples: - name, labels, value = s.name, s.labels, s.value + name, labels, value, timestamp, exemplar = s if metric.type == 'gauge': - without_pid = tuple(l for l in labels if l[0] != 'pid') + without_pid_key = (name, tuple(l for l in labels if l[0] != 'pid')) if metric._multiprocess_mode == 'min': - current = samples.setdefault((name, without_pid), value) + current = samples_setdefault(without_pid_key, value) if value < current: - samples[(s.name, without_pid)] = value + samples[without_pid_key] = value elif metric._multiprocess_mode == 'max': - current = samples.setdefault((name, without_pid), value) + current = samples_setdefault(without_pid_key, value) if value > current: - samples[(s.name, without_pid)] = value + samples[without_pid_key] = value elif metric._multiprocess_mode == 'livesum': - samples[(name, without_pid)] += value + samples[without_pid_key] += value else: # all/liveall samples[(name, labels)] = value @@ -98,16 +99,14 @@ def _accumulate_metrics(metrics, accumulate): if bucket: # _bucket without_le = tuple(l for l in labels if l[0] != 'le') - 
buckets.setdefault(without_le, {}) - buckets[without_le].setdefault(bucket[0], 0.0) buckets[without_le][bucket[0]] += value else: # _sum/_count - samples[(s.name, labels)] += value + samples[(name, labels)] += value else: # Counter and Summary. - samples[(s.name, labels)] += value + samples[(name, labels)] += value # Accumulate bucket values. if metric.type == 'histogram': From 0cfb054a24a60a132a63a034d8eb058e41f77bef Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 4 Jun 2019 17:31:03 +0300 Subject: [PATCH 7/8] Use less genexprs in multiprocess accumulate Signed-off-by: Aarni Koskela --- prometheus_client/multiprocess.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/prometheus_client/multiprocess.py b/prometheus_client/multiprocess.py index 9d351231..16547c1a 100644 --- a/prometheus_client/multiprocess.py +++ b/prometheus_client/multiprocess.py @@ -80,7 +80,7 @@ def _accumulate_metrics(metrics, accumulate): for s in metric.samples: name, labels, value, timestamp, exemplar = s if metric.type == 'gauge': - without_pid_key = (name, tuple(l for l in labels if l[0] != 'pid')) + without_pid_key = (name, tuple([l for l in labels if l[0] != 'pid'])) if metric._multiprocess_mode == 'min': current = samples_setdefault(without_pid_key, value) if value < current: @@ -95,15 +95,18 @@ def _accumulate_metrics(metrics, accumulate): samples[(name, labels)] = value elif metric.type == 'histogram': - bucket = tuple(float(l[1]) for l in labels if l[0] == 'le') - if bucket: - # _bucket - without_le = tuple(l for l in labels if l[0] != 'le') - buckets[without_le][bucket[0]] += value - else: + # A for loop with early exit is faster than a genexpr + # or a listcomp that ends up building unnecessary things + for l in labels: + if l[0] == 'le': + bucket_value = float(l[1]) + # _bucket + without_le = tuple(l for l in labels if l[0] != 'le') + buckets[without_le][bucket_value] += value + break + else: # did not find the `le` key # _sum/_count 
samples[(name, labels)] += value - else: # Counter and Summary. samples[(name, labels)] += value From 0f544ebdd52e346f468f61ce77556e7044a6f8ec Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 5 Jun 2019 09:31:29 +0300 Subject: [PATCH 8/8] Read only `used` bytes from MmapedDict files, not all the zeroes too Signed-off-by: Aarni Koskela --- prometheus_client/mmap_dict.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/prometheus_client/mmap_dict.py b/prometheus_client/mmap_dict.py index d56923ee..274b7389 100644 --- a/prometheus_client/mmap_dict.py +++ b/prometheus_client/mmap_dict.py @@ -81,8 +81,13 @@ def __init__(self, filename, read_mode=False): @staticmethod def read_all_values_from_file(filename): with open(filename, 'rb') as infp: - data = infp.read() - return _read_all_values(data) + # Read the first block of data, including the first 4 bytes which tell us + # how much of the file (which is preallocated to _INITIAL_MMAP_SIZE bytes) is occupied. + data = infp.read(65535) + used = _unpack_integer(data, 0)[0] + if used > len(data): # Then read in the rest, if needed. + data += infp.read(used - len(data)) + return _read_all_values(data, used) def _init_value(self, key): """Initialize a value. Lock must be held by caller."""