From 3e71b32185d37e8ce77f34371f362acd70d68d1f Mon Sep 17 00:00:00 2001
From: Georgi Petrov <32372905+G-D-Petrov@users.noreply.github.com>
Date: Mon, 8 Sep 2025 16:07:22 +0300
Subject: [PATCH 01/16] Add Pythagorean Won Loss Formula Notebook to docs
---
docs/mkdocs/mkdocs.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index b603a03d66..1f99b6217e 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -129,6 +129,7 @@ nav:
- Equity Analytics Notebook: 'notebooks/ArcticDB_demo_equity_analytics.ipynb'
- Equity Options Notebook: 'notebooks/ArcticDB_demo_equity_options.ipynb'
- 1 Billion Row Challenge Notebook: 'notebooks/ArcticDB_billion_row_challenge.ipynb'
+ - Pythagorean Won Loss Formula Notebook: 'notebooks/ArcticDB_pythagorean_won_loss_formula_notebook.ipynb'
- Python API Reference:
- Introduction: 'api/index.md'
- Arctic: 'api/arctic.md'
From 68603dc9e7693a859e02bb88ed7e63a04527affe Mon Sep 17 00:00:00 2001
From: Georgi Petrov <32372905+G-D-Petrov@users.noreply.github.com>
Date: Wed, 10 Sep 2025 14:32:45 +0100
Subject: [PATCH 02/16] Upgrade node version in CI to 24 (#2636)
#### Reference Issues/PRs
#### What does this implement or fix?
- Azure CI problems as seen here -
https://github.com/man-group/ArcticDB/actions/runs/17551831701/job/49846790143
  - This is fixed by upgrading the Node version and fixing how ports are
provisioned for the Azurite simulator (see the sketch after this list)
- Skips the problematic Linux Conda test for now -
https://github.com/man-group/ArcticDB/actions/runs/17551831697/job/49845948723
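
For context, a rough Python sketch of the port-provisioning idea (not the exact fixture code; it is based on the `AzuriteStorageFixtureFactory` change below and uses a hypothetical helper name):

```python
from arcticdb.storage_fixtures.utils import get_ephemeral_port

def provision_azurite_blob_port(use_ssl: bool, port=None) -> int:
    # Hypothetical helper: search for a free port from a different seed per
    # protocol so the https and http Azurite fixtures don't race for one port.
    seed = 1 if use_ssl else 10
    return port or get_ephemeral_port(seed)
```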
#### Any other comments?
#### Checklist
Checklist for code changes...
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
---
.github/workflows/build_steps.yml | 3 +-
.github/workflows/build_with_conda.yml | 8 +-
python/arcticdb/storage_fixtures/azure.py | 104 +++++++++++--------
python/arcticdb/storage_fixtures/utils.py | 22 ++--
python/tests/integration/arcticdb/test_s3.py | 2 +
5 files changed, 85 insertions(+), 54 deletions(-)
diff --git a/.github/workflows/build_steps.yml b/.github/workflows/build_steps.yml
index 97964e810d..c3443c7d1b 100644
--- a/.github/workflows/build_steps.yml
+++ b/.github/workflows/build_steps.yml
@@ -70,6 +70,7 @@ jobs:
python_impl_name: ${{inputs.python3 > 0 && format('cp3{0}', inputs.python3) || 'default'}}
CIBW_BUILD: ${{format('cp3{0}-{1}', inputs.python3, matrix.cibw_build_suffix)}}
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+ NODE_OPTIONS: --openssl-legacy-provider
defaults:
run: {shell: bash}
steps:
@@ -375,7 +376,7 @@ jobs:
if: matrix.os == 'linux' || matrix.os == 'macos'
uses: actions/setup-node@v3.3.0
with:
- node-version: '16'
+ node-version: '24'
- name: Install Azurite
uses: nick-fields/retry@v3
diff --git a/.github/workflows/build_with_conda.yml b/.github/workflows/build_with_conda.yml
index d59c5b30c0..a1fe5d24c6 100644
--- a/.github/workflows/build_with_conda.yml
+++ b/.github/workflows/build_with_conda.yml
@@ -122,7 +122,7 @@ jobs:
- name: Install npm # Linux github runner image does not come with npm
uses: actions/setup-node@v3.3.0
with:
- node-version: '16'
+ node-version: '24'
- name: Install Azurite
uses: nick-fields/retry@v3
@@ -180,6 +180,7 @@ jobs:
# Use the Mongo created in the service container above to test against
CI_MONGO_HOST: mongodb
ARCTICDB_PYTEST_ARGS: ${{ inputs.run_custom_pytest_command }}
+ NODE_OPTIONS: --openssl-legacy-provider
macos:
@@ -236,7 +237,7 @@ jobs:
- name: Install npm
uses: actions/setup-node@v3.3.0
with:
- node-version: '16'
+ node-version: '24'
- name: Install Azurite
uses: nick-fields/retry@v3
@@ -275,7 +276,7 @@ jobs:
if [[ "$(echo "$ARCTICDB_PYTEST_ARGS" | xargs)" == pytest* ]]; then
command="python -m $ARCTICDB_PYTEST_ARGS"
echo "Run custom pytest command: $command"
- python -m pip install --retries 3 --timeout 180 pytest-repeat
+ python -m pip install --retries 3 --timeout 240 pytest-repeat
echo "Run custom pytest command: $ARCTICDB_PYTEST_ARGS"
eval "$command"
else
@@ -286,4 +287,5 @@ jobs:
ARCTICDB_USING_CONDA: 1
COMMANDLINE: ${{ inputs.run_commandline }}
ARCTICDB_PYTEST_ARGS: ${{ inputs.run_custom_pytest_command }}
+ NODE_OPTIONS: --openssl-legacy-provider
diff --git a/python/arcticdb/storage_fixtures/azure.py b/python/arcticdb/storage_fixtures/azure.py
index 8ffa0c3def..15f2d1c646 100644
--- a/python/arcticdb/storage_fixtures/azure.py
+++ b/python/arcticdb/storage_fixtures/azure.py
@@ -21,7 +21,14 @@
from azure.core.exceptions import ResourceNotFoundError
from .api import *
-from .utils import _LINUX, get_ephemeral_port, GracefulProcessUtils, wait_for_server_to_come_up, safer_rmtree, get_ca_cert_for_testing
+from .utils import (
+ _LINUX,
+ get_ephemeral_port,
+ GracefulProcessUtils,
+ wait_for_server_to_come_up,
+ safer_rmtree,
+ get_ca_cert_for_testing,
+)
from arcticc.pb2.storage_pb2 import EnvironmentConfigsMap
from arcticdb.version_store.helper import add_azure_library_to_env
@@ -46,12 +53,14 @@ class AzureContainer(StorageFixture):
def _get_policy(self) -> str:
from azure.storage.blob import LinearRetry
+
# The retry_policy instance will be modified by the pipeline, so cannot be constant
return {
- "connection_timeout": 1,
- "read_timeout": 2,
- "retry_policy": LinearRetry(retry_total=3, backoff=1),
- "connection_verify": self.factory.client_cert_file}
+ "connection_timeout": 1,
+ "read_timeout": 2,
+ "retry_policy": LinearRetry(retry_total=3, backoff=1),
+ "connection_verify": self.factory.client_cert_file,
+ }
def _set_uri_and_client_azurite(self, auth: str):
from azure.storage.blob import ContainerClient
@@ -66,7 +75,6 @@ def _set_uri_and_client_azurite(self, auth: str):
self.client = ContainerClient.from_connection_string(self.arctic_uri, self.container, **self._get_policy())
# add connection_verify=False to bypass ssl checking
-
def __init__(self, factory: Union["AzuriteStorageFixtureFactory", "AzureStorageFixtureFactory"]) -> None:
from azure.storage.blob import ContainerClient
@@ -170,9 +178,12 @@ class AzuriteStorageFixtureFactory(StorageFixtureFactory):
default_prefix: str = None
- def __init__(self, port=0, working_dir: Optional[str] = None, use_ssl: bool = True, ssl_test_support: bool = True):
+ def __init__(
+ self, port=None, working_dir: Optional[str] = None, use_ssl: bool = True, ssl_test_support: bool = True
+ ):
self.http_protocol = "https" if use_ssl else "http"
- self.port = port or get_ephemeral_port(1)
+ seed = 1 if use_ssl else 10
+ self.port = port or get_ephemeral_port(seed)
self.endpoint_root = f"{self.http_protocol}://{self.host}:{self.port}"
self.working_dir = str(working_dir) if working_dir else mkdtemp(suffix="AzuriteStorageFixtureFactory")
self.ssl_test_support = ssl_test_support
@@ -184,7 +195,9 @@ def _safe_enter(self):
args = f"{shutil.which('azurite')} --blobPort {self.port} --blobHost {self.host} --queuePort 0 --tablePort 0 --skipApiVersionCheck --silent"
if self.ssl_test_support:
self.client_cert_dir = self.working_dir
- self.ca, self.key_file, self.cert_file, self.client_cert_file = get_ca_cert_for_testing(self.client_cert_dir)
+ self.ca, self.key_file, self.cert_file, self.client_cert_file = get_ca_cert_for_testing(
+ self.client_cert_dir
+ )
else:
self.ca = ""
self.key_file = ""
@@ -193,10 +206,14 @@ def _safe_enter(self):
self.client_cert_dir = ""
if self.http_protocol == "https":
args += f" --key {self.key_file} --cert {self.cert_file}"
- self._p = GracefulProcessUtils.start_with_retry(url=self.endpoint_root,
- service_name="azurite", num_retries=2, timeout=240,
- process_start_cmd=args,
- cwd=self.working_dir)
+ self._p = GracefulProcessUtils.start_with_retry(
+ url=self.endpoint_root,
+ service_name="azurite",
+ num_retries=2,
+ timeout=240,
+ process_start_cmd=args,
+ cwd=self.working_dir,
+ )
return self
def __exit__(self, exc_type, exc_value, traceback):
@@ -208,7 +225,6 @@ def create_fixture(self) -> AzureContainer:
return AzureContainer(self)
def cleanup_container(self, b: AzureContainer):
-
def delete_container_safely(client, timeout):
try:
client.delete_container(timeout=timeout)
@@ -224,19 +240,19 @@ def delete_container_safely(client, timeout):
b._admin_client.close()
else:
delete_container_safely(b.client, timeout=3)
-
+
def find_ca_certs():
# Common CA certificates locations
default_paths = ssl.get_default_verify_paths()
- possible_paths = [
+ possible_paths = [
default_paths.cafile,
default_paths.openssl_cafile_env,
default_paths.openssl_cafile,
- '/etc/ssl/certs/ca-certificates.crt',
- '/usr/lib/ssl/certs/ca-certificates.crt',
- '/etc/pki/tls/certs/ca-bundle.crt',
- '/etc/ssl/cert.pem'
+ "/etc/ssl/certs/ca-certificates.crt",
+ "/usr/lib/ssl/certs/ca-certificates.crt",
+ "/etc/pki/tls/certs/ca-bundle.crt",
+ "/etc/ssl/cert.pem",
]
for path in possible_paths:
if path and os.path.isfile(path):
@@ -253,21 +269,18 @@ def copy_ca_certs(source_path: str, new_filename: str) -> str:
temp_dir = tempfile.gettempdir()
destination_path = os.path.join(temp_dir, new_filename)
shutil.copy2(source_path, destination_path)
- os.chmod(destination_path, stat.S_IRUSR | stat.S_IWUSR |
- stat.S_IRGRP | stat.S_IWGRP |
- stat.S_IROTH | stat.S_IWOTH)
+ os.chmod(destination_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
return destination_path
class AzureStorageFixtureFactory(StorageFixtureFactory):
-
endpoint: str
- account_name : str
- account_key : str
- connection_string : str = None
+ account_name: str
+ account_key: str
+ connection_string: str = None
default_container: str = None
default_prefix: Optional[str] = None
- client_cert_file : str = None
+ client_cert_file: str = None
protocol: str = None
clean_bucket_on_fixture_exit = True
@@ -291,19 +304,25 @@ def __exit__(self, exc_type, exc_value, traceback):
def __str__(self):
return f"[{type(self)}=Container:{self.default_container}], ConnectionString:{self.connection_string}"
- def initialize_from_connection_sting(self, constr: str, container: str, prefix: str = None) -> "AzureStorageFixtureFactory":
+ def initialize_from_connection_sting(
+ self, constr: str, container: str, prefix: str = None
+ ) -> "AzureStorageFixtureFactory":
def extract_from_regex(re_expr: str, constr: str) -> str:
match = re.search(re_expr, constr)
return match.group(1) if match else ""
- if constr is None: get_logger().error(f"Azure connection string not available: {constr}")
- if container is None: get_logger().error(f"Azure container not available: {container}")
+ if constr is None:
+ get_logger().error(f"Azure connection string not available: {constr}")
+ if container is None:
+ get_logger().error(f"Azure container not available: {container}")
AzureStorageFixtureFactory.connection_string = constr
- AzureStorageFixtureFactory.account_name = extract_from_regex(r'AccountName=([^;]+)', constr)
- AzureStorageFixtureFactory.account_key = extract_from_regex(r'AccountKey=([^;]+)', constr)
- AzureStorageFixtureFactory.protocol = extract_from_regex(r'DefaultEndpointsProtocol=([^;]+)', constr)
- endpoint_suffix = extract_from_regex(r'EndpointSuffix=([^;]+)', constr)
- AzureStorageFixtureFactory.endpoint = f"{AzureStorageFixtureFactory.protocol}://{AzureStorageFixtureFactory.account_name}.blob.{endpoint_suffix}"
+ AzureStorageFixtureFactory.account_name = extract_from_regex(r"AccountName=([^;]+)", constr)
+ AzureStorageFixtureFactory.account_key = extract_from_regex(r"AccountKey=([^;]+)", constr)
+ AzureStorageFixtureFactory.protocol = extract_from_regex(r"DefaultEndpointsProtocol=([^;]+)", constr)
+ endpoint_suffix = extract_from_regex(r"EndpointSuffix=([^;]+)", constr)
+ AzureStorageFixtureFactory.endpoint = (
+ f"{AzureStorageFixtureFactory.protocol}://{AzureStorageFixtureFactory.account_name}.blob.{endpoint_suffix}"
+ )
AzureStorageFixtureFactory.default_container = container
if prefix:
AzureStorageFixtureFactory.default_prefix = prefix
@@ -312,7 +331,7 @@ def extract_from_regex(re_expr: str, constr: str) -> str:
def get_arctic_uri(self):
url = f"azure://Container={self.default_container};Path_prefix={self.default_prefix}"
if self.client_cert_file:
- url += f";CA_cert_path={self.client_cert_file}"
+ url += f";CA_cert_path={self.client_cert_file}"
if self.connection_string:
url += f";{self.connection_string}"
else:
@@ -323,10 +342,10 @@ def create_fixture(self) -> AzureContainer:
return AzureContainer(self)
def cleanup_container(self, b: AzureContainer):
- b.slow_cleanup(failure_consequence="The following delete bucket call will also fail. ")
+ b.slow_cleanup(failure_consequence="The following delete bucket call will also fail. ")
if len(b.libs_from_factory) > 0:
- get_logger().warning(f"Libraries not cleared remaining {b.libs_from_factory.keys()}")
-
+ get_logger().warning(f"Libraries not cleared remaining {b.libs_from_factory.keys()}")
+
def real_azure_from_environment_variables(
shared_path: bool, native_config: Optional[NativeVariantStorage] = None, additional_suffix: str = ""
@@ -338,6 +357,7 @@ def real_azure_from_environment_variables(
prefix = os.getenv("ARCTICDB_PERSISTENT_STORAGE_UNIQUE_PATH_PREFIX", "") + additional_suffix
out.initialize_from_connection_sting(
constr=os.getenv("ARCTICDB_REAL_AZURE_CONNECTION_STRING"),
- container=os.getenv("ARCTICDB_REAL_AZURE_CONTAINER"),
- prefix=prefix)
+ container=os.getenv("ARCTICDB_REAL_AZURE_CONTAINER"),
+ prefix=prefix,
+ )
return out
diff --git a/python/arcticdb/storage_fixtures/utils.py b/python/arcticdb/storage_fixtures/utils.py
index b1dba4130a..bad867ed37 100644
--- a/python/arcticdb/storage_fixtures/utils.py
+++ b/python/arcticdb/storage_fixtures/utils.py
@@ -29,6 +29,10 @@
_LINUX = sys.platform.lower().startswith("linux")
_DEBUG = os.getenv("ACTIONS_RUNNER_DEBUG", default=None) in (1, "True")
+import logging
+
+logger = logging.getLogger("Utils")
+
def get_ephemeral_port(seed=0):
# Some OS has a tendency to reuse a port number that has just been closed, so if we use the trick from
@@ -39,7 +43,9 @@ def get_ephemeral_port(seed=0):
while port < 65535:
try:
with socketserver.TCPServer(("localhost", port), None):
- time.sleep(30 if ARCTICDB_USING_CONDA else 20) # Hold the port open for a while to improve the chance of collision detection
+ time.sleep(
+ 30 if ARCTICDB_USING_CONDA else 20
+ ) # Hold the port open for a while to improve the chance of collision detection
return port
except OSError as e:
print(repr(e), file=sys.stderr)
@@ -61,19 +67,19 @@ def start(cmd, **kwargs):
print("About to run:", cmd)
creation_flags = subprocess.CREATE_NEW_PROCESS_GROUP if _WINDOWS else 0
return subprocess.Popen(cmd, creationflags=creation_flags, **kwargs)
-
+
@staticmethod
- def start_with_retry(url: str, service_name: str, num_retries: int, timeout: int,
- process_start_cmd: str, **kwargs):
+ def start_with_retry(url: str, service_name: str, num_retries: int, timeout: int, process_start_cmd: str, **kwargs):
"""Attempts to start the process up to specified times.
-
+
Each time will wait for service to be avil at specified url up to the specified timeout"""
- for i in range(num_retries): # retry in case of connection problems
+ for i in range(num_retries): # retry in case of connection problems
try:
p = GracefulProcessUtils.start(process_start_cmd, **kwargs)
wait_for_server_to_come_up(url, service_name, p, timeout=timeout)
return p
- except AssertionError:
+ except AssertionError as ex:
+ logger.error(ex)
try:
p.terminate()
except:
@@ -177,4 +183,4 @@ def get_ca_cert_for_testing(working_dir):
cwd=working_dir,
shell=True,
)
- return ca, key_file, cert_file, client_cert_file # Need to keep ca alive to authenticate the cert
+ return ca, key_file, cert_file, client_cert_file # Need to keep ca alive to authenticate the cert
diff --git a/python/tests/integration/arcticdb/test_s3.py b/python/tests/integration/arcticdb/test_s3.py
index 7986a8de2b..c575f3bd19 100644
--- a/python/tests/integration/arcticdb/test_s3.py
+++ b/python/tests/integration/arcticdb/test_s3.py
@@ -24,6 +24,7 @@
from arcticdb.storage_fixtures.s3 import MotoS3StorageFixtureFactory
from arcticdb.util.test import config_context, config_context_string
+from tests.util.mark import SKIP_CONDA_MARK
pytestmark = pytest.mark.skipif(
sys.version_info.major == 3 and sys.version_info.minor == 6 and sys.platform == "linux",
@@ -195,6 +196,7 @@ def test_wrapped_s3_storage(lib_name, wrapped_s3_storage_bucket):
lib.write("s", data=create_df())
+@SKIP_CONDA_MARK # issue with fixture init will be fixed in https://github.com/man-group/ArcticDB/issues/2640
def test_library_get_key_path(lib_name, s3_and_nfs_storage_bucket, test_prefix):
lib = s3_and_nfs_storage_bucket.create_version_store_factory(lib_name)()
lib.write("s", data=create_df())
From 1b13f924d38c19ab381b99ada20d4d1e175e2e14 Mon Sep 17 00:00:00 2001
From: IvoDD
Date: Mon, 15 Sep 2025 09:44:51 +0300
Subject: [PATCH 03/16] [9898131742] Fix arrow projection with dynamic schema
(#2630)
#### Reference Issues/PRs
Monday ref: 9898131742
#### What does this implement or fix?
This PR modifies the `NullReducer` code so that it no longer relies on
the slice index and, by preserving `column_block_idx_` state, avoids an
unneeded `log(n)` search for the offset.
#### Any other comments?
`NullReducer` code was assuming that `len(slice_and_keys) =
len(row_slices_per_column)` when using `dynamic_schema=True`. That is
not true if we use projections.
E.g. for the following projection our slicing would look like:
```
Given:
TD key 1:
index A
1 1
2 2
TD key 2:
index A B
3 3 1
4 4 2
TD key 3:
index B
5 3
6 4
And if we do a projection like `q.apply("C", q["A"] + q["B"])`, our slicing would
look like:
Slice 1: TD key 1
Slice 2: TD key 2
Slice 3:
index C
3 4
4 6
Slice 4: TD key 3
```
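
For reference, a minimal Python sketch that reproduces the scenario above (assuming a library fixture `lib` created with `dynamic_schema=True`; it mirrors the `test_project_dynamic_schema` test added in this patch):

```python
import pandas as pd
from arcticdb.version_store.processing import QueryBuilder

def project_across_missing_columns(lib, sym="sym"):
    # Three row-slices: "B" is missing from the first and "A" from the last.
    lib.write(sym, pd.DataFrame({"A": [1, 2]}))
    lib.append(sym, pd.DataFrame({"A": [3, 4], "B": [1, 2]}))
    lib.append(sym, pd.DataFrame({"B": [3, 4]}))
    q = QueryBuilder()
    q = q.apply("C", q["A"] + q["B"])  # produces the extra projected slice
    return lib.read(sym, query_builder=q).data
```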
#### Checklist
Checklist for code changes...
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
---
cpp/arcticdb/pipeline/read_frame.cpp | 35 ++++++++-------
cpp/arcticdb/version/version_core.cpp | 3 ++
python/arcticdb/util/test.py | 16 +++++--
.../unit/arcticdb/version_store/test_arrow.py | 44 ++++++++++++++++++-
4 files changed, 78 insertions(+), 20 deletions(-)
diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp
index ecca2f686e..a221753fb3 100644
--- a/cpp/arcticdb/pipeline/read_frame.cpp
+++ b/cpp/arcticdb/pipeline/read_frame.cpp
@@ -732,6 +732,7 @@ class NullValueReducer {
std::shared_ptr<PipelineContext> context_;
SegmentInMemory frame_;
size_t pos_;
+ size_t column_block_idx_;
DecodePathData shared_data_;
std::any& handler_data_;
const OutputFormat output_format_;
@@ -751,6 +752,7 @@ class NullValueReducer {
context_(context),
frame_(std::move(frame)),
pos_(frame_.offset()),
+ column_block_idx_(0),
shared_data_(std::move(shared_data)),
handler_data_(handler_data),
output_format_(output_format),
@@ -761,18 +763,17 @@ class NullValueReducer {
return context_row.slice_and_key().slice_.row_range.first;
}
- void backfill_all_zero_validity_bitmaps(size_t offset_bytes_start, size_t offset_bytes_end_idx) {
- // Explanation: offset_bytes_start and offset_bytes_end should both be elements of block_offsets by
- // construction. We must add an all zeros validity bitmap for each row-slice read from storage where this
- // column was missing, in order to correctly populate the Arrow record-batches for the output
+ void backfill_all_zero_validity_bitmaps_up_to(std::optional<size_t> up_to_block_offset) {
+ // Fills up all validity bitmaps with zeros from `column_block_idx_` until reaching `up_to_block_offset`.
+ // If `up_to_block_offset` is `std::nullopt` then fills up until the end of the column.
const auto& block_offsets = column_.block_offsets();
- auto start_it = std::ranges::lower_bound(block_offsets, offset_bytes_start);
- util::check(start_it != block_offsets.cend() && *start_it == offset_bytes_start,
- "NullValueReducer: Failed to find offset_bytes_start {} in block_offsets {}",
- offset_bytes_start, block_offsets);
- for (auto idx = static_cast<size_t>(std::distance(block_offsets.begin(), start_it)); idx < offset_bytes_end_idx; ++idx) {
- auto rows = (block_offsets.at(idx + 1) - block_offsets.at(idx)) / type_bytes_;
- create_dense_bitmap_all_zeros(block_offsets.at(idx), rows, column_, AllocationType::DETACHABLE);
+ util::check(!up_to_block_offset.has_value() || up_to_block_offset.value() <= block_offsets.back(), "up_to_block_offset outside of range");
+ for (; column_block_idx_ < block_offsets.size() - 1; ++column_block_idx_) {
+ if (up_to_block_offset.has_value() && block_offsets.at(column_block_idx_) >= up_to_block_offset.value()) {
+ break;
+ }
+ auto rows = (block_offsets.at(column_block_idx_ + 1) - block_offsets.at(column_block_idx_)) / type_bytes_;
+ create_dense_bitmap_all_zeros(block_offsets.at(column_block_idx_), rows, column_, AllocationType::DETACHABLE);
}
}
@@ -783,6 +784,7 @@ class NullValueReducer {
if (current_pos != pos_) {
const auto num_rows = current_pos - pos_;
const auto start_row = pos_ - frame_.offset();
+ const auto end_row = current_pos - frame_.offset();
if (const std::shared_ptr<TypeHandler>& handler = get_type_handler(output_format_, column_.type()); handler) {
handler->default_initialize(column_.buffer(), start_row * handler->type_size(), num_rows * handler->type_size(), shared_data_, handler_data_);
} else if (output_format_ != OutputFormat::ARROW) {
@@ -790,11 +792,12 @@ class NullValueReducer {
column_.default_initialize_rows(start_row, num_rows, false, default_value_);
}
if (output_format_ == OutputFormat::ARROW) {
- backfill_all_zero_validity_bitmaps(start_row * type_bytes_, context_row.index());
+ backfill_all_zero_validity_bitmaps_up_to(end_row * type_bytes_);
}
- pos_ = current_pos + sz_to_advance;
- } else {
- pos_ += sz_to_advance;
+ }
+ pos_ = current_pos + sz_to_advance;
+ if (output_format_ == OutputFormat::ARROW) {
+ ++column_block_idx_;
}
}
@@ -812,7 +815,7 @@ class NullValueReducer {
column_.default_initialize_rows(start_row, num_rows, false, default_value_);
}
if (output_format_ == OutputFormat::ARROW) {
- backfill_all_zero_validity_bitmaps(start_row * type_bytes_, column_.block_offsets().size() - 1);
+ backfill_all_zero_validity_bitmaps_up_to(std::nullopt);
}
}
}
diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index c274d8b754..a22a86c83b 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -1381,6 +1381,7 @@ void copy_frame_data_to_buffer(
const ColumnMapping mapping{src_column.type(), dst_column.type(), destination.field(target_index), type_size, num_rows, row_range.first, offset, total_size, target_index};
handler->convert_type(src_column, dst_column, mapping, shared_data, handler_data, source.string_pool_ptr());
} else if (is_empty_type(src_column.type().data_type())) {
+ // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
dst_column.type().visit_tag([&](auto dst_desc_tag) {
util::initialize(dst_ptr, total_size, default_value);
});
@@ -1389,6 +1390,7 @@ void copy_frame_data_to_buffer(
details::visit_type(dst_column.type().data_type(), [&](auto dst_tag) {
using dst_type_info = ScalarTypeInfo<decltype(dst_tag)>;
typename dst_type_info::RawType* typed_dst_ptr = reinterpret_cast<typename dst_type_info::RawType*>(dst_ptr);
+ // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
util::initialize(dst_ptr, num_rows * dst_rawtype_size, default_value);
details::visit_type(src_column.type().data_type(), [&](auto src_tag) {
using src_type_info = ScalarTypeInfo<decltype(src_tag)>;
@@ -1408,6 +1410,7 @@ void copy_frame_data_to_buffer(
dst_ptr += row_count * sizeof(SourceType);
}
} else {
+ // TODO: For arrow we want to set validity bitmaps instead of `initialize`ing
util::initialize(dst_ptr, num_rows * dst_rawtype_size, default_value);
SourceType* typed_dst_ptr = reinterpret_cast<SourceType*>(dst_ptr);
Column::for_each_enumerated(src_column, [&](const auto& row) {
diff --git a/python/arcticdb/util/test.py b/python/arcticdb/util/test.py
index 53aeae6ee5..342f74ca1b 100644
--- a/python/arcticdb/util/test.py
+++ b/python/arcticdb/util/test.py
@@ -242,15 +242,25 @@ def assert_frame_equal_rebuild_index_first(expected: pd.DataFrame, actual: pd.Da
assert_frame_equal(left=expected, right=actual)
-def convert_arrow_to_pandas_and_remove_categoricals(table):
+def convert_arrow_to_pandas_for_tests(table):
+ """
+ Converts pa.Table outputted via `output_format=OutputFormat.EXPERIMENTAL_ARROW` to a pd.DataFrame so it would be
+ identical to the one outputted via `output_format=OutputFormat.PANDAS`. This requires two changes:
+ - Replaces dictionary encoded string columns with regular string columns.
+ - Fills null values in int colums with zeros.
+ """
new_table = stringify_dictionary_encoded_columns(table)
+ for i, name in enumerate(new_table.column_names):
+ if pa.types.is_integer(new_table.column(i).type):
+ new_col = new_table.column(i).fill_null(0)
+ new_table = new_table.set_column(i, name, new_col)
return new_table.to_pandas()
def assert_frame_equal_with_arrow(left, right, **kwargs):
if isinstance(left, pa.Table):
- left = convert_arrow_to_pandas_and_remove_categoricals(left)
+ left = convert_arrow_to_pandas_for_tests(left)
if isinstance(right, pa.Table):
- right = convert_arrow_to_pandas_and_remove_categoricals(right)
+ right = convert_arrow_to_pandas_for_tests(right)
assert_frame_equal(left, right, **kwargs)
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow.py b/python/tests/unit/arcticdb/version_store/test_arrow.py
index 25fadde6f0..589c86da36 100644
--- a/python/tests/unit/arcticdb/version_store/test_arrow.py
+++ b/python/tests/unit/arcticdb/version_store/test_arrow.py
@@ -10,6 +10,7 @@
from arcticdb.version_store.processing import QueryBuilder
from arcticdb.options import OutputFormat
import pyarrow as pa
+import pyarrow.compute as pc
from arcticdb.util.hypothesis import (
use_of_function_scoped_fixtures_in_hypothesis_checked,
ENDIANNESS,
@@ -17,7 +18,7 @@
dataframe_strategy,
column_strategy,
)
-from arcticdb.util.test import get_sample_dataframe
+from arcticdb.util.test import get_sample_dataframe, make_dynamic
from arcticdb_ext.storage import KeyType
from tests.util.mark import WINDOWS
@@ -625,3 +626,44 @@ def test_arrow_dynamic_schema_filtered_column(lmdb_version_store_dynamic_schema_
q = q[q["col"] < 5]
received = stringify_dictionary_encoded_columns(lib.read(sym, query_builder=q).data)
assert expected.equals(received)
+
+
+def test_project_dynamic_schema(lmdb_version_store_dynamic_schema_v1):
+ lib = lmdb_version_store_dynamic_schema_v1
+ lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ sym = "sym"
+ table_1 = pa.table({"a": pa.array([1, 2])})
+ table_2 = pa.table({"a": pa.array([3, 4]), "b": pa.array([1, 2])})
+ table_3 = pa.table({"b": pa.array([3, 4])})
+ lib.write(sym, table_1.to_pandas())
+ lib.append(sym, table_2.to_pandas())
+ lib.append(sym, table_3.to_pandas())
+ q = QueryBuilder()
+ q = q.apply("c", q["a"] * q["b"] + 10)
+ received = lib.read(sym, query_builder=q).data
+ expected = pa.concat_tables([table_1, table_2, table_3], promote_options="permissive")
+ expected_new_col = pc.add(pc.multiply(expected.column("a"), expected.column("b")), 10)
+ expected = expected.append_column("c", expected_new_col)
+ assert expected.equals(received)
+
+
+def test_project_dynamic_schema_complex(lmdb_version_store_dynamic_schema_v1):
+ lib = lmdb_version_store_dynamic_schema_v1
+ lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ sym = "sym"
+ df = pd.DataFrame({
+ "int_col_1": np.arange(0, 10, dtype=np.int16),
+ "int_col_2": np.arange(10, 20, dtype=np.int32),
+ "float_col": np.arange(20, 30, dtype=np.float64),
+ })
+ expected, slices = make_dynamic(df)
+ for df_slice in slices:
+ lib.append(sym, df_slice, write_if_missing=True)
+
+ q = QueryBuilder()
+ q = q.apply("new_float_1", q["int_col_1"] / q["float_col"] + 1)
+ q = q.apply("new_float_2", q["int_col_2"] * q["new_float_1"])
+
+ table = lib.read(sym, query_builder=q).data
+ expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
+ assert_frame_equal_with_arrow(table, expected)
From a10ecd74a73f3e5947057fa229713cca47396413 Mon Sep 17 00:00:00 2001
From: grusev
Date: Mon, 15 Sep 2025 10:51:10 +0300
Subject: [PATCH 04/16] Only one version when running storage tests (#2643)
#### Reference Issues/PRs
#### What does this implement or fix?
Currently for storage tests we run all OS-Python combinations, although
the storage tests are executed only with Python 3.11 on Windows and
Linux. That creates a confusing mix of results where the majority
(3.8-3.10, 3.12, 3.13) are LMDB tests and only the 3.11 runs are real
storage tests. The ideal solution is to limit these runs to 3.11 so that
only results from real storage tests appear in them. For example, see
this link to a run of the real tests:
https://github.com/man-group/ArcticDB/actions/runs/17659474240. It has
so many LMDB runs and only 2 are real storage runs. Can you guess which
ones?
A run for GCPXML:
https://github.com/man-group/ArcticDB/actions/runs/17643448437
Note: Linux 3.8 still exists because it comes from a matrix `include`,
which cannot be made conditional. Still, all the others are gone from
the run.
#### Any other comments?
#### Checklist
Checklist for code changes...
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
---
.github/workflows/build.yml | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3315cce89f..7ff0e3ce2f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -300,7 +300,8 @@ jobs:
strategy:
fail-fast: false
matrix:
- python3: ${{fromJson(vars.LINUX_PYTHON_VERSIONS || '[8, 9, 10, 11, 12, 13]')}}
+ # For storage runs that are not full matrix will have only one Python version
+ python3: ${{fromJson( !inputs.run_full_matrix_of_persistent_tests && (needs.storage_type.outputs.storage != 'no') && '[11]' || '[8, 9, 10, 11, 12, 13]')}}
include:
- python_deps_ids: [""]
matrix_override: ${{fromJson(needs.common_config.outputs.linux_matrix)}}
@@ -367,7 +368,8 @@ jobs:
strategy:
fail-fast: false
matrix:
- python3: ${{fromJson(vars.LINUX_PYTHON_VERSIONS || '[8, 9, 10, 11, 12, 13]')}}
+ # For storage runs that are not full matrix will have only one Python version
+ python3: ${{fromJson( !inputs.run_full_matrix_of_persistent_tests && (needs.storage_type.outputs.storage != 'no') && '[11]' || '[8, 9, 10, 11, 12, 13]')}}
include:
- matrix_override: ${{fromJson(needs.common_config.outputs.windows_matrix)}}
name: 3.${{matrix.python3}} Windows
@@ -460,7 +462,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python3: ${{ fromJson(vars.LINUX_PYTHON_VERSIONS || '[8, 9, 10, 11]') }}
+ python3: ${{ fromJson( !inputs.run_full_matrix_of_persistent_tests && (needs.storage_type.outputs.storage != 'no') && '[11]' || '[8, 9, 10, 11]' ) }}
include:
- python_deps_ids: [""]
matrix_override: ${{fromJson(needs.common_config.outputs.macos_matrix)}}
From ffe58bc89c63962f615385eb3a02fd0a700f2951 Mon Sep 17 00:00:00 2001
From: IvoDD
Date: Mon, 15 Sep 2025 13:18:25 +0300
Subject: [PATCH 05/16] [9898177828] Use `default_value` in `NullReducer` for
`OutputFormat::ARROW` (#2633)
#### Reference Issues/PRs
Monday ref: 9898177828
#### What does this implement or fix?
When doing aggregation we explicitly default `sum=0` for slices with no
underlying values.
For arrow this means to not set the validity bitmap in this case and to
default initialize the values.
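As a minimal sketch of the behaviour being fixed (assuming a dynamic-schema library fixture `lib` with the experimental Arrow output format enabled, mirroring the `test_aggregation_empty_slices` test added in this patch), a grouped sum over a row-slice that lacks the aggregated column now comes back as 0 rather than null:

```python
import pandas as pd
from arcticdb.version_store.processing import QueryBuilder

def grouped_sum_with_missing_slice(lib, sym="sym"):
    lib.write(sym, pd.DataFrame({"group_col": ["a", "b"], "sum_col": [1.0, 2.0]}),
              dynamic_strings=True)
    # This appended slice has no "sum_col"; its groups' sums should be 0, not null.
    lib.append(sym, pd.DataFrame({"group_col": ["c", "d"]}), dynamic_strings=True)
    q = QueryBuilder()
    q.groupby("group_col").agg({"sum_col": "sum"})
    return lib.read(sym, query_builder=q).data
```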
The change includes:
- Small refactor of `NullReducer` to extract common parts between
`reduce` and `finalize` in `backfill_up_to_frame_offset`
- Modification of `Column::default_initialize` to work across several
blocks
- Removes broken `memset` method from `ChunkedBuffer` and instead
provides a new `util::initialize` method which can initialize a
`ChunkedBuffer` across blocks
#### Any other comments?
#### Checklist
Checklist for code changes...
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
---
cpp/arcticdb/column_store/chunked_buffer.hpp | 30 ++++++------
cpp/arcticdb/column_store/column.cpp | 4 +-
cpp/arcticdb/pipeline/read_frame.cpp | 48 +++++++------------
cpp/arcticdb/util/sparse_utils.hpp | 42 ++++------------
.../unit/arcticdb/version_store/test_arrow.py | 41 ++++++++++++++++
5 files changed, 84 insertions(+), 81 deletions(-)
diff --git a/cpp/arcticdb/column_store/chunked_buffer.hpp b/cpp/arcticdb/column_store/chunked_buffer.hpp
index d1fdb616e8..aecd32166b 100644
--- a/cpp/arcticdb/column_store/chunked_buffer.hpp
+++ b/cpp/arcticdb/column_store/chunked_buffer.hpp
@@ -311,7 +311,7 @@ class ChunkedBufferImpl {
uint8_t* bytes_at(size_t pos_bytes, size_t required) {
auto [block, pos, _] = block_and_offset(pos_bytes);
- util::check(pos + required <= block->bytes(), "Block overflow, position {} is greater than block capacity {}", pos, block->bytes());
+ util::check(pos + required <= block->bytes(), "Block overflow, position {} is greater than block capacity {}", pos + required, block->bytes());
return &(*block)[pos];
}
@@ -366,21 +366,21 @@ class ChunkedBufferImpl {
}
}
- void memset_buffer(size_t offset, size_t bytes, char value) {
- auto [block, pos, block_index] = block_and_offset(offset);
- while(bytes > 0) {
- const auto size_to_write = block->bytes() - pos;
- memset(block->data() + pos, size_to_write, value);
- bytes -= size_to_write;
- if(bytes > 0) {
- ++block_index;
- if(block_index == blocks_.size())
- return;
-
- block = blocks_[block_index];
- pos = 0;
- }
+ // Returns a vector of continuous buffers, each designated by a pointer and size
+ // Similar to `bytes_at` but will work if the requested range spans multiple continuous blocks.
+ std::vector<std::pair<uint8_t*, size_t>> byte_blocks_at(size_t pos_bytes, size_t required_bytes) {
+ check_bytes(pos_bytes, required_bytes);
+ std::vector<std::pair<uint8_t*, size_t>> result;
+ auto [block, pos, block_index] = block_and_offset(pos_bytes);
+ while(required_bytes > 0) {
+ block = blocks_[block_index];
+ const auto size_to_write = std::min(required_bytes, block->bytes() - pos);
+ result.push_back({block->data() + pos, size_to_write});
+ required_bytes -= size_to_write;
+ ++block_index;
+ pos = 0;
}
+ return result;
}
template
diff --git a/cpp/arcticdb/column_store/column.cpp b/cpp/arcticdb/column_store/column.cpp
index 502e4c3bc2..318a196888 100644
--- a/cpp/arcticdb/column_store/column.cpp
+++ b/cpp/arcticdb/column_store/column.cpp
@@ -670,9 +670,7 @@ void Column::default_initialize_rows(size_t start_pos, size_t num_rows, bool ens
if (ensure_alloc) {
data_.ensure(bytes);
}
- // This doesn't work if we default_initialize bytes which span across multiple blocks.
- auto type_ptr = reinterpret_cast(data_.bytes_at(start_pos * sizeof(RawType), bytes));
- util::initialize(reinterpret_cast(type_ptr), bytes, default_value);
+ util::initialize(data_.buffer(), start_pos * sizeof(RawType), bytes, default_value);
if (ensure_alloc) {
data_.commit();
}
diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp
index a221753fb3..2a00740c65 100644
--- a/cpp/arcticdb/pipeline/read_frame.cpp
+++ b/cpp/arcticdb/pipeline/read_frame.cpp
@@ -763,38 +763,38 @@ class NullValueReducer {
return context_row.slice_and_key().slice_.row_range.first;
}
- void backfill_all_zero_validity_bitmaps_up_to(std::optional up_to_block_offset) {
+ void backfill_all_zero_validity_bitmaps_up_to(size_t up_to_block_offset) {
// Fills up all validity bitmaps with zeros from `column_block_idx_` until reaching `up_to_block_offset`.
- // If `up_to_block_offset` is `std::nullopt` then fills up until the end of the column.
const auto& block_offsets = column_.block_offsets();
- util::check(!up_to_block_offset.has_value() || up_to_block_offset.value() <= block_offsets.back(), "up_to_block_offset outside of range");
- for (; column_block_idx_ < block_offsets.size() - 1; ++column_block_idx_) {
- if (up_to_block_offset.has_value() && block_offsets.at(column_block_idx_) >= up_to_block_offset.value()) {
- break;
- }
+ util::check(up_to_block_offset <= block_offsets.back(), "up_to_block_offset {} outside of range {}", up_to_block_offset, block_offsets.back());
+ for (; column_block_idx_ < block_offsets.size() - 1 && block_offsets.at(column_block_idx_) < up_to_block_offset; ++column_block_idx_) {
auto rows = (block_offsets.at(column_block_idx_ + 1) - block_offsets.at(column_block_idx_)) / type_bytes_;
create_dense_bitmap_all_zeros(block_offsets.at(column_block_idx_), rows, column_, AllocationType::DETACHABLE);
}
}
- void reduce(PipelineContextRow &context_row){
- auto &slice_and_key = context_row.slice_and_key();
- auto sz_to_advance = slice_and_key.slice_.row_range.diff();
- auto current_pos = context_row.slice_and_key().slice_.row_range.first;
- if (current_pos != pos_) {
- const auto num_rows = current_pos - pos_;
+ void backfill_up_to_frame_offset(size_t up_to) {
+ if (pos_ != up_to) {
+ const auto num_rows = up_to - pos_;
const auto start_row = pos_ - frame_.offset();
- const auto end_row = current_pos - frame_.offset();
+ const auto end_row = up_to - frame_.offset();
if (const std::shared_ptr<TypeHandler>& handler = get_type_handler(output_format_, column_.type()); handler) {
handler->default_initialize(column_.buffer(), start_row * handler->type_size(), num_rows * handler->type_size(), shared_data_, handler_data_);
- } else if (output_format_ != OutputFormat::ARROW) {
+ } else if (output_format_ != OutputFormat::ARROW || default_value_.has_value()) {
// Arrow does not care what values are in the main buffer where the validity bitmap is zero
column_.default_initialize_rows(start_row, num_rows, false, default_value_);
}
- if (output_format_ == OutputFormat::ARROW) {
+ if (output_format_ == OutputFormat::ARROW && !default_value_.has_value()) {
backfill_all_zero_validity_bitmaps_up_to(end_row * type_bytes_);
}
}
+ }
+
+ void reduce(PipelineContextRow &context_row){
+ auto &slice_and_key = context_row.slice_and_key();
+ auto sz_to_advance = slice_and_key.slice_.row_range.diff();
+ auto current_pos = context_row.slice_and_key().slice_.row_range.first;
+ backfill_up_to_frame_offset(current_pos);
pos_ = current_pos + sz_to_advance;
if (output_format_ == OutputFormat::ARROW) {
++column_block_idx_;
@@ -804,20 +804,8 @@ class NullValueReducer {
void finalize() {
const auto total_rows = frame_.row_count();
const auto end = frame_.offset() + total_rows;
- if(pos_ != end) {
- util::check(pos_ < end, "Overflow in finalize {} > {}", pos_, end);
- const auto num_rows = end - pos_;
- const auto start_row = pos_ - frame_.offset();
- if (const std::shared_ptr<TypeHandler>& handler = get_type_handler(output_format_, column_.type()); handler) {
- handler->default_initialize(column_.buffer(), start_row * handler->type_size(), num_rows * handler->type_size(), shared_data_, handler_data_);
- } else if (output_format_ != OutputFormat::ARROW) {
- // Arrow does not care what values are in the main buffer where the validity bitmap is zero
- column_.default_initialize_rows(start_row, num_rows, false, default_value_);
- }
- if (output_format_ == OutputFormat::ARROW) {
- backfill_all_zero_validity_bitmaps_up_to(std::nullopt);
- }
- }
+ util::check(pos_ <= end, "Overflow in finalize {} > {}", pos_, end);
+ backfill_up_to_frame_offset(end);
}
};
diff --git a/cpp/arcticdb/util/sparse_utils.hpp b/cpp/arcticdb/util/sparse_utils.hpp
index e56e0bc3fc..fc75c12979 100644
--- a/cpp/arcticdb/util/sparse_utils.hpp
+++ b/cpp/arcticdb/util/sparse_utils.hpp
@@ -86,39 +86,6 @@ void default_initialize(uint8_t* data, const size_t bytes) {
}
}
-template
-requires util::instantiation_of
-void default_initialize(ChunkedBuffer& buffer, size_t offset, const size_t bytes, DecodePathData shared_data, std::any& handler_data) {
- using RawType = typename TagType::DataTypeTag::raw_type;
- const auto num_rows ARCTICDB_UNUSED = bytes / sizeof(RawType);
- constexpr auto type = static_cast(TagType{});
- constexpr auto data_type = type.data_type();
- ColumnData column_data{&buffer, type};
- auto pos = column_data.begin();
- std::advance(pos, offset);
- //auto end = column_data.begin();
- if constexpr (is_sequence_type(data_type)) {
- std::fill_n(pos, num_rows, not_a_string());
- } else if constexpr (is_floating_point_type(data_type)) {
- std::fill_n(pos, num_rows, std::numeric_limits<RawType>::quiet_NaN());
- } else if constexpr (is_time_type(data_type)) {
- std::fill_n(pos, num_rows, NaT);
- } else if constexpr (is_integer_type(data_type) || is_bool_type(data_type)) {
- buffer.memset_buffer(offset, bytes, 0);
- } else {
- constexpr auto type_descriptor = TagType::type_descriptor();
- if (const std::shared_ptr<TypeHandler>& handler = arcticdb::TypeHandlerRegistry::instance()->get_handler(type_descriptor);handler) {
- handler->default_initialize(buffer, offset, bytes, shared_data, handler_data);
- } else {
- internal::raise(
- "Default initialization for {} is not implemented.",
- type_descriptor
- );
- }
- }
-}
-
-
/// Initialize a buffer either using a custom default value or using a predefined default value for the type
/// @param[in] default_value Variant holding either a value of the raw type for the type tag or std::monostate
template
@@ -137,6 +104,15 @@ void initialize(uint8_t* data, const size_t bytes, const std::optional& d
}
}
+template
+requires util::instantiation_of
+void initialize(ChunkedBuffer& buffer, size_t offset, size_t bytes, const std::optional& default_value) {
+ auto blocks = buffer.byte_blocks_at(offset, bytes);
+ for (auto [data, size] : blocks) {
+ initialize(data, size, default_value);
+ }
+}
+
[[nodiscard]] util::BitSet scan_object_type_to_sparse(
const PyObject* const* ptr,
size_t rows_to_write);
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow.py b/python/tests/unit/arcticdb/version_store/test_arrow.py
index 589c86da36..9e7afc5fee 100644
--- a/python/tests/unit/arcticdb/version_store/test_arrow.py
+++ b/python/tests/unit/arcticdb/version_store/test_arrow.py
@@ -667,3 +667,44 @@ def test_project_dynamic_schema_complex(lmdb_version_store_dynamic_schema_v1):
table = lib.read(sym, query_builder=q).data
expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
assert_frame_equal_with_arrow(table, expected)
+
+
+def test_aggregation_empty_slices(lmdb_version_store_dynamic_schema_v1):
+ lib = lmdb_version_store_dynamic_schema_v1
+ lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ sym = "sym"
+ df_1 = pd.DataFrame({
+ "group_col": [chr(ord("a")+i) for i in range(5)],
+ "mean_col": np.arange(0, 5, dtype=np.float64),
+ "sum_col": np.arange(0, 5, dtype=np.float64),
+ "min_col": np.arange(0, 5, dtype=np.float64),
+ "max_col": np.arange(0, 5, dtype=np.float64),
+ "count_col": np.arange(0, 5, dtype=np.float64),
+ })
+ df_2 = pd.DataFrame({
+ "group_col": [chr(ord("a")+i+10) for i in range(5)],
+ })
+ lib.write(sym, df_1, dynamic_strings=True)
+ lib.append(sym, df_2, dynamic_strings=True)
+
+ q = QueryBuilder()
+ q.groupby("group_col").agg({
+ "mean_col": "mean",
+ "sum_col": "sum",
+ "min_col": "min",
+ "max_col": "max",
+ "count_col": "count",
+ })
+
+ table = lib.read(sym, query_builder=q).data
+ # sum_col is correctly filled with 0s instead of nulls
+ assert pc.count(table.column("sum_col"), mode="only_null").as_py() == 0
+ # TODO: Fix the TODOs in `CopyToBufferTask` to make num_nulls=5 as expected
+ # For this test it so happens that one present and one missing value end up in the same bucket.
+ # Copying then default initializes the missing values instead of setting the validity bitmap.
+ # assert pc.count(table.column("mean_col"), mode="only_null").as_py() == 5
+ # assert pc.count(table.column("min_col"), mode="only_null").as_py() == 5
+ # assert pc.count(table.column("max_col"), mode="only_null").as_py() == 5
+ # assert pc.count(table.column("count_col"), mode="only_null").as_py() == 5
+ expected = lib.read(sym, query_builder=q, output_format=OutputFormat.PANDAS).data
+ assert_frame_equal_with_arrow(table, expected)
From 7acd347e3a035b635fd71c1bde2b54e0135ac8db Mon Sep 17 00:00:00 2001
From: Georgi Petrov <32372905+G-D-Petrov@users.noreply.github.com>
Date: Wed, 17 Sep 2025 09:32:47 +0100
Subject: [PATCH 06/16] Apply formatting rules (#2649)
#### Reference Issues/PRs
Monday ref: 10048929527
#### What does this implement or fix?
#### Any other comments?
#### Checklist
Checklist for code changes...
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
---
.github/workflows/build.yml | 6 +-
cpp/arcticdb/arrow/array_from_block.hpp | 105 +-
cpp/arcticdb/arrow/arrow_handlers.cpp | 93 +-
cpp/arcticdb/arrow/arrow_handlers.hpp | 56 +-
cpp/arcticdb/arrow/arrow_output_frame.cpp | 21 +-
cpp/arcticdb/arrow/arrow_output_frame.hpp | 19 +-
cpp/arcticdb/arrow/arrow_utils.cpp | 33 +-
cpp/arcticdb/arrow/arrow_utils.hpp | 3 +-
cpp/arcticdb/arrow/test/test_arrow.cpp | 98 +-
cpp/arcticdb/async/async_store.cpp | 17 +-
cpp/arcticdb/async/async_store.hpp | 705 +++---
cpp/arcticdb/async/base_task.hpp | 5 +-
cpp/arcticdb/async/batch_read_args.hpp | 8 +-
cpp/arcticdb/async/bit_rate_stats.cpp | 66 +-
cpp/arcticdb/async/bit_rate_stats.hpp | 33 +-
cpp/arcticdb/async/python_bindings.cpp | 24 +-
cpp/arcticdb/async/python_bindings.hpp | 7 +-
cpp/arcticdb/async/task_scheduler.cpp | 55 +-
cpp/arcticdb/async/task_scheduler.hpp | 201 +-
cpp/arcticdb/async/tasks.cpp | 78 +-
cpp/arcticdb/async/tasks.hpp | 396 +--
cpp/arcticdb/async/test/test_async.cpp | 339 +--
cpp/arcticdb/codec/codec-inl.hpp | 106 +-
cpp/arcticdb/codec/codec.cpp | 372 ++-
cpp/arcticdb/codec/codec.hpp | 72 +-
cpp/arcticdb/codec/core.hpp | 175 +-
cpp/arcticdb/codec/default_codecs.hpp | 9 +-
cpp/arcticdb/codec/encode_common.hpp | 121 +-
cpp/arcticdb/codec/encode_v1.cpp | 274 ++-
cpp/arcticdb/codec/encode_v2.cpp | 201 +-
cpp/arcticdb/codec/encoded_field.cpp | 11 +-
cpp/arcticdb/codec/encoded_field.hpp | 319 +--
.../codec/encoded_field_collection.hpp | 92 +-
cpp/arcticdb/codec/encoding_sizes.hpp | 82 +-
cpp/arcticdb/codec/lz4.hpp | 83 +-
cpp/arcticdb/codec/magic_words.hpp | 21 +-
cpp/arcticdb/codec/passthrough.hpp | 77 +-
cpp/arcticdb/codec/protobuf_mappings.cpp | 33 +-
cpp/arcticdb/codec/protobuf_mappings.hpp | 28 +-
cpp/arcticdb/codec/python_bindings.cpp | 122 +-
cpp/arcticdb/codec/python_bindings.hpp | 10 +-
cpp/arcticdb/codec/segment.cpp | 241 +-
cpp/arcticdb/codec/segment.hpp | 166 +-
cpp/arcticdb/codec/segment_header.cpp | 61 +-
cpp/arcticdb/codec/segment_header.hpp | 226 +-
cpp/arcticdb/codec/segment_identifier.hpp | 43 +-
cpp/arcticdb/codec/slice_data_sink.hpp | 28 +-
cpp/arcticdb/codec/test/test_codec.cpp | 410 +--
.../test/test_encode_field_collection.cpp | 3 +-
.../codec/test/test_encoded_field.cpp | 20 +-
.../codec/test/test_segment_header.cpp | 17 +-
cpp/arcticdb/codec/tp4.hpp | 87 +-
.../codec/typed_block_encoder_impl.hpp | 266 +-
cpp/arcticdb/codec/zstd.hpp | 47 +-
cpp/arcticdb/column_store/block.hpp | 106 +-
cpp/arcticdb/column_store/chunked_buffer.cpp | 76 +-
cpp/arcticdb/column_store/chunked_buffer.hpp | 234 +-
cpp/arcticdb/column_store/column.cpp | 542 ++--
cpp/arcticdb/column_store/column.hpp | 516 ++--
cpp/arcticdb/column_store/column_data.cpp | 15 +-
cpp/arcticdb/column_store/column_data.hpp | 313 +--
.../column_data_random_accessor.hpp | 90 +-
cpp/arcticdb/column_store/column_map.hpp | 18 +-
cpp/arcticdb/column_store/column_utils.hpp | 73 +-
cpp/arcticdb/column_store/key_segment.cpp | 102 +-
cpp/arcticdb/column_store/key_segment.hpp | 21 +-
cpp/arcticdb/column_store/memory_segment.cpp | 308 +--
cpp/arcticdb/column_store/memory_segment.hpp | 82 +-
.../column_store/memory_segment_impl.cpp | 597 +++--
.../column_store/memory_segment_impl.hpp | 292 ++-
cpp/arcticdb/column_store/python_bindings.cpp | 21 +-
cpp/arcticdb/column_store/python_bindings.hpp | 10 +-
cpp/arcticdb/column_store/row_ref.hpp | 27 +-
cpp/arcticdb/column_store/segment_utils.hpp | 13 +-
cpp/arcticdb/column_store/statistics.hpp | 82 +-
cpp/arcticdb/column_store/string_pool.cpp | 124 +-
cpp/arcticdb/column_store/string_pool.hpp | 73 +-
.../column_store/test/benchmark_column.cpp | 19 +-
.../test/benchmark_memory_segment.cpp | 40 +-
.../test/ingestion_stress_test.cpp | 105 +-
.../test/rapidcheck_chunked_buffer.cpp | 36 +-
.../column_store/test/rapidcheck_column.cpp | 62 +-
...rapidcheck_column_data_random_accessor.cpp | 7 +-
.../test/rapidcheck_column_map.cpp | 14 +-
.../test/rapidcheck_column_store.cpp | 17 +-
.../column_store/test/test_chunked_buffer.cpp | 17 +-
.../column_store/test/test_column.cpp | 65 +-
.../test/test_column_data_random_accessor.cpp | 7 +-
.../test/test_index_filtering.cpp | 96 +-
.../column_store/test/test_memory_segment.cpp | 187 +-
.../column_store/test/test_statistics.cpp | 10 +-
cpp/arcticdb/entity/atom_key.hpp | 224 +-
cpp/arcticdb/entity/data_error.cpp | 65 +-
cpp/arcticdb/entity/data_error.hpp | 28 +-
cpp/arcticdb/entity/descriptor_item.hpp | 16 +-
cpp/arcticdb/entity/descriptors.hpp | 2 +-
cpp/arcticdb/entity/field_collection.cpp | 77 +-
cpp/arcticdb/entity/field_collection.hpp | 162 +-
.../entity/field_collection_proto.cpp | 6 +-
.../entity/field_collection_proto.hpp | 5 +-
cpp/arcticdb/entity/frame_and_descriptor.hpp | 5 +-
cpp/arcticdb/entity/index_range.hpp | 78 +-
cpp/arcticdb/entity/key.cpp | 121 +-
cpp/arcticdb/entity/key.hpp | 120 +-
cpp/arcticdb/entity/merge_descriptors.cpp | 119 +-
cpp/arcticdb/entity/merge_descriptors.hpp | 55 +-
cpp/arcticdb/entity/metrics.cpp | 230 +-
cpp/arcticdb/entity/metrics.hpp | 205 +-
cpp/arcticdb/entity/native_tensor.hpp | 137 +-
cpp/arcticdb/entity/output_format.hpp | 13 +-
cpp/arcticdb/entity/performance_tracing.cpp | 48 +-
cpp/arcticdb/entity/performance_tracing.hpp | 98 +-
cpp/arcticdb/entity/protobuf_mappings.cpp | 89 +-
cpp/arcticdb/entity/protobuf_mappings.hpp | 24 +-
cpp/arcticdb/entity/protobufs.hpp | 29 +-
cpp/arcticdb/entity/read_result.hpp | 75 +-
cpp/arcticdb/entity/ref_key.hpp | 137 +-
cpp/arcticdb/entity/serialized_key.hpp | 273 +-
cpp/arcticdb/entity/stage_result.hpp | 8 +-
cpp/arcticdb/entity/stream_descriptor.hpp | 245 +-
cpp/arcticdb/entity/test/test_atom_key.cpp | 93 +-
.../entity/test/test_field_collection.cpp | 5 +-
.../entity/test/test_key_serialization.cpp | 98 +-
cpp/arcticdb/entity/test/test_metrics.cpp | 8 +-
cpp/arcticdb/entity/test/test_ref_key.cpp | 5 +-
.../entity/test/test_stream_descriptor.cpp | 15 +-
cpp/arcticdb/entity/test/test_tensor.cpp | 158 +-
cpp/arcticdb/entity/timeseries_descriptor.hpp | 156 +-
cpp/arcticdb/entity/type_conversion.hpp | 269 +-
cpp/arcticdb/entity/type_utils.cpp | 411 ++--
cpp/arcticdb/entity/type_utils.hpp | 39 +-
cpp/arcticdb/entity/types-inl.hpp | 83 +-
cpp/arcticdb/entity/types.cpp | 16 +-
cpp/arcticdb/entity/types.hpp | 502 ++--
cpp/arcticdb/entity/types_proto.cpp | 55 +-
cpp/arcticdb/entity/types_proto.hpp | 82 +-
cpp/arcticdb/entity/variant_key.hpp | 38 +-
cpp/arcticdb/entity/versioned_item.hpp | 14 +-
cpp/arcticdb/log/log.cpp | 247 +-
cpp/arcticdb/log/log.hpp | 68 +-
cpp/arcticdb/log/test/test_log.cpp | 7 +-
cpp/arcticdb/log/trace.hpp | 9 +-
cpp/arcticdb/pipeline/column_mapping.cpp | 220 +-
cpp/arcticdb/pipeline/column_mapping.hpp | 79 +-
cpp/arcticdb/pipeline/column_stats.cpp | 220 +-
cpp/arcticdb/pipeline/column_stats.hpp | 17 +-
cpp/arcticdb/pipeline/execution.hpp | 70 +-
cpp/arcticdb/pipeline/filter_segment.hpp | 20 +-
cpp/arcticdb/pipeline/frame_slice.cpp | 41 +-
cpp/arcticdb/pipeline/frame_slice.hpp | 211 +-
cpp/arcticdb/pipeline/frame_slice_map.hpp | 49 +-
cpp/arcticdb/pipeline/frame_utils.cpp | 120 +-
cpp/arcticdb/pipeline/frame_utils.hpp | 238 +-
cpp/arcticdb/pipeline/index_fields.hpp | 30 +-
.../pipeline/index_segment_reader.cpp | 119 +-
.../pipeline/index_segment_reader.hpp | 64 +-
cpp/arcticdb/pipeline/index_utils.cpp | 85 +-
cpp/arcticdb/pipeline/index_utils.hpp | 98 +-
cpp/arcticdb/pipeline/index_writer.hpp | 121 +-
cpp/arcticdb/pipeline/input_tensor_frame.hpp | 57 +-
cpp/arcticdb/pipeline/pandas_output_frame.hpp | 12 +-
cpp/arcticdb/pipeline/pipeline_common.hpp | 5 +-
cpp/arcticdb/pipeline/pipeline_context.cpp | 57 +-
cpp/arcticdb/pipeline/pipeline_context.hpp | 94 +-
cpp/arcticdb/pipeline/pipeline_utils.hpp | 32 +-
cpp/arcticdb/pipeline/query.cpp | 63 +-
cpp/arcticdb/pipeline/query.hpp | 318 +--
cpp/arcticdb/pipeline/read_frame.cpp | 826 ++++---
cpp/arcticdb/pipeline/read_frame.hpp | 83 +-
cpp/arcticdb/pipeline/read_options.hpp | 61 +-
cpp/arcticdb/pipeline/read_pipeline.cpp | 31 +-
cpp/arcticdb/pipeline/read_pipeline.hpp | 84 +-
cpp/arcticdb/pipeline/read_query.cpp | 21 +-
cpp/arcticdb/pipeline/read_query.hpp | 2 +-
cpp/arcticdb/pipeline/slicing.cpp | 80 +-
cpp/arcticdb/pipeline/slicing.hpp | 103 +-
cpp/arcticdb/pipeline/string_pool_utils.cpp | 5 +-
cpp/arcticdb/pipeline/string_pool_utils.hpp | 33 +-
cpp/arcticdb/pipeline/string_reducers.hpp | 133 +-
cpp/arcticdb/pipeline/test/test_container.hpp | 31 +-
.../pipeline/test/test_frame_allocation.cpp | 21 +-
cpp/arcticdb/pipeline/test/test_pipeline.cpp | 98 +-
cpp/arcticdb/pipeline/test/test_query.cpp | 42 +-
cpp/arcticdb/pipeline/test/test_value.cpp | 70 +-
cpp/arcticdb/pipeline/value.hpp | 86 +-
cpp/arcticdb/pipeline/value_set.cpp | 208 +-
cpp/arcticdb/pipeline/value_set.hpp | 38 +-
cpp/arcticdb/pipeline/write_frame.cpp | 322 +--
cpp/arcticdb/pipeline/write_frame.hpp | 74 +-
cpp/arcticdb/pipeline/write_options.hpp | 18 +-
.../processing/aggregation_interface.hpp | 18 +-
cpp/arcticdb/processing/aggregation_utils.cpp | 7 +-
cpp/arcticdb/processing/aggregation_utils.hpp | 3 +-
cpp/arcticdb/processing/bucketizer.hpp | 18 +-
cpp/arcticdb/processing/clause.cpp | 1245 +++++-----
cpp/arcticdb/processing/clause.hpp | 369 ++-
cpp/arcticdb/processing/clause_utils.cpp | 230 +-
cpp/arcticdb/processing/clause_utils.hpp | 180 +-
cpp/arcticdb/processing/component_manager.cpp | 11 +-
cpp/arcticdb/processing/component_manager.hpp | 102 +-
.../processing/expression_context.hpp | 22 +-
cpp/arcticdb/processing/expression_node.cpp | 478 ++--
cpp/arcticdb/processing/expression_node.hpp | 68 +-
cpp/arcticdb/processing/grouper.hpp | 26 +-
.../processing/operation_dispatch.cpp | 90 +-
.../processing/operation_dispatch.hpp | 5 +-
.../processing/operation_dispatch_binary.cpp | 244 +-
.../processing/operation_dispatch_binary.hpp | 668 +++--
.../operation_dispatch_binary_eq.cpp | 3 +-
.../operation_dispatch_binary_gt.cpp | 6 +-
.../operation_dispatch_binary_gte.cpp | 6 +-
.../operation_dispatch_binary_lt.cpp | 6 +-
.../operation_dispatch_binary_lte.cpp | 6 +-
.../operation_dispatch_binary_neq.cpp | 6 +-
...ration_dispatch_binary_operator_divide.cpp | 3 +-
...eration_dispatch_binary_operator_minus.cpp | 3 +-
...peration_dispatch_binary_operator_plus.cpp | 3 +-
...eration_dispatch_binary_operator_times.cpp | 3 +-
.../processing/operation_dispatch_ternary.cpp | 631 +++--
.../processing/operation_dispatch_ternary.hpp | 13 +-
.../processing/operation_dispatch_unary.cpp | 106 +-
.../processing/operation_dispatch_unary.hpp | 173 +-
cpp/arcticdb/processing/operation_types.hpp | 957 +++----
cpp/arcticdb/processing/processing_unit.cpp | 167 +-
cpp/arcticdb/processing/processing_unit.hpp | 386 +--
cpp/arcticdb/processing/query_planner.cpp | 25 +-
cpp/arcticdb/processing/query_planner.hpp | 19 +-
.../processing/signed_unsigned_comparison.hpp | 63 +-
.../processing/sorted_aggregation.cpp | 228 +-
.../processing/sorted_aggregation.hpp | 223 +-
cpp/arcticdb/processing/ternary_utils.hpp | 121 +-
.../processing/test/benchmark_binary.cpp | 7 +-
.../processing/test/benchmark_clause.cpp | 56 +-
.../processing/test/benchmark_common.cpp | 35 +-
.../processing/test/benchmark_common.hpp | 11 +-
.../processing/test/benchmark_projection.cpp | 3 +-
.../processing/test/benchmark_ternary.cpp | 33 +-
.../processing/test/rapidcheck_resample.cpp | 28 +-
.../test/test_arithmetic_type_promotion.cpp | 643 +++--
cpp/arcticdb/processing/test/test_clause.cpp | 185 +-
.../test/test_component_manager.cpp | 18 +-
.../processing/test/test_expression.cpp | 17 +-
.../test/test_filter_and_project_sparse.cpp | 92 +-
.../processing/test/test_join_schemas.cpp | 231 +-
.../test/test_operation_dispatch.cpp | 21 +-
.../test_output_schema_aggregator_types.cpp | 136 +-
.../test/test_output_schema_ast_validity.cpp | 197 +-
.../test/test_output_schema_basic.cpp | 3 +-
.../test/test_parallel_processing.cpp | 80 +-
.../processing/test/test_resample.cpp | 318 ++-
.../processing/test/test_set_membership.cpp | 3 +-
.../test/test_signed_unsigned_comparison.cpp | 3 +-
.../processing/test/test_type_comparison.cpp | 3 +-
.../processing/test/test_type_promotion.cpp | 13 +-
.../test/test_unsorted_aggregation.cpp | 51 +-
.../processing/unsorted_aggregation.cpp | 607 ++---
.../processing/unsorted_aggregation.hpp | 141 +-
cpp/arcticdb/python/adapt_read_dataframe.hpp | 10 +-
cpp/arcticdb/python/arctic_version.cpp | 9 +-
cpp/arcticdb/python/arctic_version.hpp | 5 +-
cpp/arcticdb/python/gil_lock.hpp | 14 +-
cpp/arcticdb/python/normalization_checks.cpp | 152 +-
cpp/arcticdb/python/normalization_checks.hpp | 11 +-
cpp/arcticdb/python/numpy_buffer_holder.hpp | 14 +-
cpp/arcticdb/python/python_handler_data.hpp | 46 +-
cpp/arcticdb/python/python_handlers.cpp | 268 +-
cpp/arcticdb/python/python_handlers.hpp | 145 +-
cpp/arcticdb/python/python_module.cpp | 301 +--
cpp/arcticdb/python/python_strings.cpp | 66 +-
cpp/arcticdb/python/python_strings.hpp | 160 +-
.../python/python_to_tensor_frame.cpp | 120 +-
.../python/python_to_tensor_frame.hpp | 40 +-
cpp/arcticdb/python/python_types.hpp | 20 +-
cpp/arcticdb/python/python_utils.cpp | 16 +-
cpp/arcticdb/python/python_utils.hpp | 206 +-
cpp/arcticdb/python/reader.hpp | 67 +-
cpp/arcticdb/storage/async_storage.hpp | 15 +-
.../storage/azure/azure_client_impl.cpp | 69 +-
.../storage/azure/azure_client_impl.hpp | 30 +-
.../storage/azure/azure_client_interface.hpp | 55 +-
cpp/arcticdb/storage/azure/azure_storage.cpp | 306 ++-
cpp/arcticdb/storage/azure/azure_storage.hpp | 32 +-
.../coalesced/multi_segment_header.hpp | 105 +-
.../storage/coalesced/multi_segment_utils.hpp | 146 +-
cpp/arcticdb/storage/common.hpp | 43 +-
cpp/arcticdb/storage/config_cache.hpp | 74 +-
cpp/arcticdb/storage/config_resolvers.cpp | 43 +-
cpp/arcticdb/storage/config_resolvers.hpp | 67 +-
cpp/arcticdb/storage/constants.hpp | 5 +-
cpp/arcticdb/storage/failure_simulation.hpp | 100 +-
cpp/arcticdb/storage/file/file_store.hpp | 107 +-
.../storage/file/mapped_file_storage.cpp | 81 +-
.../storage/file/mapped_file_storage.hpp | 46 +-
cpp/arcticdb/storage/key_segment_pair.hpp | 138 +-
cpp/arcticdb/storage/library.hpp | 84 +-
cpp/arcticdb/storage/library_index.hpp | 25 +-
cpp/arcticdb/storage/library_manager.cpp | 177 +-
cpp/arcticdb/storage/library_manager.hpp | 171 +-
cpp/arcticdb/storage/library_path.hpp | 90 +-
.../storage/lmdb/lmdb_client_impl.cpp | 30 +-
.../storage/lmdb/lmdb_client_impl.hpp | 37 +-
.../storage/lmdb/lmdb_client_interface.hpp | 36 +-
cpp/arcticdb/storage/lmdb/lmdb_storage.cpp | 129 +-
cpp/arcticdb/storage/lmdb/lmdb_storage.hpp | 32 +-
.../storage/memory/memory_storage.cpp | 65 +-
.../storage/memory/memory_storage.hpp | 77 +-
cpp/arcticdb/storage/memory_layout.hpp | 76 +-
.../storage/mock/azure_mock_client.cpp | 64 +-
.../storage/mock/azure_mock_client.hpp | 33 +-
.../storage/mock/lmdb_mock_client.cpp | 68 +-
.../storage/mock/lmdb_mock_client.hpp | 41 +-
.../storage/mock/mongo_mock_client.cpp | 105 +-
.../storage/mock/mongo_mock_client.hpp | 81 +-
cpp/arcticdb/storage/mock/s3_mock_client.cpp | 103 +-
cpp/arcticdb/storage/mock/s3_mock_client.hpp | 55 +-
.../storage/mock/storage_mock_client.hpp | 29 +-
cpp/arcticdb/storage/mongo/mongo_client.cpp | 250 +-
cpp/arcticdb/storage/mongo/mongo_client.hpp | 66 +-
.../storage/mongo/mongo_client_interface.hpp | 63 +-
cpp/arcticdb/storage/mongo/mongo_instance.cpp | 13 +-
cpp/arcticdb/storage/mongo/mongo_instance.hpp | 6 +-
cpp/arcticdb/storage/mongo/mongo_storage.cpp | 141 +-
cpp/arcticdb/storage/mongo/mongo_storage.hpp | 21 +-
cpp/arcticdb/storage/object_store_utils.hpp | 19 +-
cpp/arcticdb/storage/open_mode.hpp | 32 +-
cpp/arcticdb/storage/protobuf_mappings.hpp | 48 +-
cpp/arcticdb/storage/python_bindings.cpp | 539 ++--
cpp/arcticdb/storage/python_bindings.hpp | 7 +-
.../storage/s3/aws_provider_chain.cpp | 96 +-
.../storage/s3/aws_provider_chain.hpp | 13 +-
cpp/arcticdb/storage/s3/detail-inl.hpp | 506 ++--
cpp/arcticdb/storage/s3/ec2_utils.cpp | 56 +-
cpp/arcticdb/storage/s3/ec2_utils.hpp | 6 +-
.../storage/s3/nfs_backed_storage.cpp | 220 +-
.../storage/s3/nfs_backed_storage.hpp | 39 +-
cpp/arcticdb/storage/s3/s3_api.cpp | 29 +-
cpp/arcticdb/storage/s3/s3_api.hpp | 17 +-
cpp/arcticdb/storage/s3/s3_client_impl.cpp | 124 +-
cpp/arcticdb/storage/s3/s3_client_impl.hpp | 50 +-
.../storage/s3/s3_client_interface.hpp | 70 +-
cpp/arcticdb/storage/s3/s3_client_wrapper.cpp | 65 +-
cpp/arcticdb/storage/s3/s3_client_wrapper.hpp | 50 +-
cpp/arcticdb/storage/s3/s3_settings.hpp | 301 +--
cpp/arcticdb/storage/s3/s3_storage.cpp | 169 +-
cpp/arcticdb/storage/s3/s3_storage.hpp | 107 +-
cpp/arcticdb/storage/s3/s3_storage_tool.cpp | 127 +-
cpp/arcticdb/storage/s3/s3_storage_tool.hpp | 13 +-
cpp/arcticdb/storage/single_file_storage.hpp | 44 +-
cpp/arcticdb/storage/storage.hpp | 163 +-
cpp/arcticdb/storage/storage_exceptions.hpp | 107 +-
cpp/arcticdb/storage/storage_factory.cpp | 25 +-
cpp/arcticdb/storage/storage_factory.hpp | 19 +-
cpp/arcticdb/storage/storage_options.hpp | 5 +-
cpp/arcticdb/storage/storage_override.hpp | 182 +-
cpp/arcticdb/storage/storage_utils.cpp | 104 +-
cpp/arcticdb/storage/storage_utils.hpp | 21 +-
cpp/arcticdb/storage/storages.hpp | 172 +-
cpp/arcticdb/storage/store.hpp | 15 +-
cpp/arcticdb/storage/test/common.hpp | 37 +-
cpp/arcticdb/storage/test/in_memory_store.hpp | 482 ++--
.../storage/test/mongo_server_fixture.hpp | 18 +-
.../storage/test/test_azure_storage.cpp | 29 +-
.../storage/test/test_local_storages.cpp | 245 +-
.../storage/test/test_memory_storage.cpp | 15 +-
.../storage/test/test_multi_segment.cpp | 5 +-
cpp/arcticdb/storage/test/test_s3_storage.cpp | 177 +-
.../storage/test/test_storage_exceptions.cpp | 322 ++-
.../storage/test/test_storage_factory.cpp | 6 +-
.../storage/test/test_storage_operations.cpp | 99 +-
cpp/arcticdb/stream/aggregator-inl.hpp | 15 +-
cpp/arcticdb/stream/aggregator.cpp | 5 +-
cpp/arcticdb/stream/aggregator.hpp | 182 +-
cpp/arcticdb/stream/incompletes.cpp | 613 ++---
cpp/arcticdb/stream/incompletes.hpp | 137 +-
cpp/arcticdb/stream/index.cpp | 70 +-
cpp/arcticdb/stream/index.hpp | 80 +-
cpp/arcticdb/stream/index_aggregator.hpp | 52 +-
cpp/arcticdb/stream/merge.hpp | 67 +-
cpp/arcticdb/stream/merge_utils.hpp | 44 +-
cpp/arcticdb/stream/piloted_clock.hpp | 10 +-
cpp/arcticdb/stream/protobuf_mappings.cpp | 24 +-
cpp/arcticdb/stream/protobuf_mappings.hpp | 13 +-
cpp/arcticdb/stream/python_bindings.cpp | 472 ++--
cpp/arcticdb/stream/python_bindings.hpp | 12 +-
cpp/arcticdb/stream/row_builder.hpp | 155 +-
cpp/arcticdb/stream/schema.hpp | 79 +-
cpp/arcticdb/stream/segment_aggregator.hpp | 72 +-
cpp/arcticdb/stream/stream_reader.hpp | 68 +-
cpp/arcticdb/stream/stream_sink.hpp | 136 +-
cpp/arcticdb/stream/stream_source.hpp | 73 +-
cpp/arcticdb/stream/stream_utils.hpp | 302 +--
cpp/arcticdb/stream/stream_writer.hpp | 130 +-
.../stream/test/stream_test_common.cpp | 7 +-
.../stream/test/stream_test_common.hpp | 172 +-
cpp/arcticdb/stream/test/test_aggregator.cpp | 35 +-
cpp/arcticdb/stream/test/test_incompletes.cpp | 91 +-
.../stream/test/test_protobuf_mappings.cpp | 4 +-
cpp/arcticdb/stream/test/test_row_builder.cpp | 32 +-
.../stream/test/test_segment_aggregator.cpp | 31 +-
cpp/arcticdb/stream/test/test_types.cpp | 27 +-
cpp/arcticdb/toolbox/library_tool.cpp | 98 +-
cpp/arcticdb/toolbox/library_tool.hpp | 24 +-
cpp/arcticdb/toolbox/python_bindings.cpp | 120 +-
cpp/arcticdb/toolbox/python_bindings.hpp | 7 +-
cpp/arcticdb/toolbox/query_stats.cpp | 115 +-
cpp/arcticdb/toolbox/query_stats.hpp | 43 +-
cpp/arcticdb/toolbox/storage_mover.hpp | 501 ++--
cpp/arcticdb/util/allocation_tracing.cpp | 18 +-
cpp/arcticdb/util/allocation_tracing.hpp | 20 +-
cpp/arcticdb/util/allocator.cpp | 554 ++---
cpp/arcticdb/util/allocator.hpp | 42 +-
cpp/arcticdb/util/bitset.hpp | 13 +-
cpp/arcticdb/util/buffer.hpp | 245 +-
cpp/arcticdb/util/buffer_pool.cpp | 20 +-
cpp/arcticdb/util/buffer_pool.hpp | 19 +-
cpp/arcticdb/util/clock.hpp | 22 +-
cpp/arcticdb/util/composite.hpp | 404 ++-
cpp/arcticdb/util/configs_map.hpp | 53 +-
cpp/arcticdb/util/constants.hpp | 5 +-
cpp/arcticdb/util/constructors.hpp | 55 +-
.../util/container_filter_wrapper.hpp | 18 +-
cpp/arcticdb/util/cursor.hpp | 47 +-
cpp/arcticdb/util/cursored_buffer.hpp | 92 +-
cpp/arcticdb/util/decimal.cpp | 550 ++---
cpp/arcticdb/util/decimal.hpp | 105 +-
cpp/arcticdb/util/decode_path_data.hpp | 20 +-
cpp/arcticdb/util/dump_bytes.hpp | 18 +-
cpp/arcticdb/util/encoding_conversion.hpp | 40 +-
cpp/arcticdb/util/error_code.cpp | 28 +-
cpp/arcticdb/util/error_code.hpp | 154 +-
cpp/arcticdb/util/exponential_backoff.hpp | 37 +-
cpp/arcticdb/util/flatten_utils.hpp | 39 +-
cpp/arcticdb/util/format_bytes.hpp | 13 +-
cpp/arcticdb/util/format_date.cpp | 43 +-
cpp/arcticdb/util/format_date.hpp | 3 +-
cpp/arcticdb/util/global_lifetimes.cpp | 9 +-
cpp/arcticdb/util/global_lifetimes.hpp | 8 +-
cpp/arcticdb/util/hash.hpp | 22 +-
cpp/arcticdb/util/home_directory.hpp | 14 +-
cpp/arcticdb/util/key_utils.hpp | 212 +-
cpp/arcticdb/util/lazy.hpp | 10 +-
cpp/arcticdb/util/lock_table.hpp | 27 +-
cpp/arcticdb/util/lru_cache.hpp | 13 +-
cpp/arcticdb/util/magic_num.hpp | 35 +-
cpp/arcticdb/util/memory_mapped_file.hpp | 68 +-
cpp/arcticdb/util/memory_tracing.hpp | 78 +-
cpp/arcticdb/util/movable_priority_queue.hpp | 18 +-
cpp/arcticdb/util/name_validation.cpp | 135 +-
cpp/arcticdb/util/name_validation.hpp | 7 +-
cpp/arcticdb/util/native_handler.hpp | 16 +-
cpp/arcticdb/util/offset_string.cpp | 18 +-
cpp/arcticdb/util/offset_string.hpp | 19 +-
cpp/arcticdb/util/optional_defaults.hpp | 11 +-
cpp/arcticdb/util/pb_util.hpp | 17 +-
cpp/arcticdb/util/preconditions.hpp | 155 +-
cpp/arcticdb/util/preprocess.hpp | 9 +-
cpp/arcticdb/util/pybind_mutex.hpp | 50 +-
cpp/arcticdb/util/python_bindings.cpp | 10 +-
cpp/arcticdb/util/python_bindings.hpp | 7 +-
cpp/arcticdb/util/ranges_from_future.hpp | 16 +-
cpp/arcticdb/util/regex_filter.hpp | 115 +-
.../util/reliable_storage_lock-inl.hpp | 213 +-
cpp/arcticdb/util/reliable_storage_lock.hpp | 43 +-
cpp/arcticdb/util/simple_string_hash.hpp | 6 +-
cpp/arcticdb/util/slab_allocator.hpp | 93 +-
cpp/arcticdb/util/sparse_utils.cpp | 20 +-
cpp/arcticdb/util/sparse_utils.hpp | 72 +-
cpp/arcticdb/util/spinlock.hpp | 11 +-
cpp/arcticdb/util/storage_lock.hpp | 114 +-
cpp/arcticdb/util/string_utils.cpp | 28 +-
cpp/arcticdb/util/string_utils.hpp | 8 +-
cpp/arcticdb/util/string_wrapping_value.hpp | 40 +-
cpp/arcticdb/util/test/config_common.hpp | 19 +-
cpp/arcticdb/util/test/generators.hpp | 270 +-
cpp/arcticdb/util/test/gtest.hpp | 14 +-
cpp/arcticdb/util/test/gtest_main.cpp | 7 +-
cpp/arcticdb/util/test/gtest_utils.hpp | 13 +-
cpp/arcticdb/util/test/random_throw.hpp | 16 +-
cpp/arcticdb/util/test/rapidcheck.hpp | 6 +-
cpp/arcticdb/util/test/rapidcheck_decimal.cpp | 16 +-
.../util/test/rapidcheck_generators.cpp | 8 +-
.../util/test/rapidcheck_generators.hpp | 192 +-
.../util/test/rapidcheck_lru_cache.cpp | 1 -
cpp/arcticdb/util/test/rapidcheck_main.cpp | 5 +-
.../util/test/rapidcheck_string_pool.cpp | 9 +-
cpp/arcticdb/util/test/test_bitmagic.cpp | 17 +-
cpp/arcticdb/util/test/test_buffer_pool.cpp | 17 +-
cpp/arcticdb/util/test/test_composite.cpp | 22 +-
cpp/arcticdb/util/test/test_cursor.cpp | 14 +-
cpp/arcticdb/util/test/test_error_code.cpp | 17 +-
.../util/test/test_exponential_backoff.cpp | 25 +-
cpp/arcticdb/util/test/test_folly.cpp | 34 +-
cpp/arcticdb/util/test/test_format_date.cpp | 11 +-
cpp/arcticdb/util/test/test_hash.cpp | 139 +-
.../util/test/test_id_transformation.cpp | 9 +-
cpp/arcticdb/util/test/test_key_utils.cpp | 11 +-
.../util/test/test_ranges_from_future.cpp | 3 +-
cpp/arcticdb/util/test/test_regex.cpp | 10 +-
.../util/test/test_reliable_storage_lock.cpp | 49 +-
.../util/test/test_slab_allocator.cpp | 72 +-
cpp/arcticdb/util/test/test_storage_lock.cpp | 98 +-
cpp/arcticdb/util/test/test_string_pool.cpp | 12 +-
cpp/arcticdb/util/test/test_string_utils.cpp | 3 +-
.../util/test/test_tracing_allocator.cpp | 5 +-
cpp/arcticdb/util/test/test_utils.hpp | 132 +-
cpp/arcticdb/util/thread_cached_int.hpp | 100 +-
cpp/arcticdb/util/timeouts.hpp | 5 +-
cpp/arcticdb/util/timer.hpp | 101 +-
cpp/arcticdb/util/trace.cpp | 23 +-
cpp/arcticdb/util/trace.hpp | 5 +-
cpp/arcticdb/util/type_handler.cpp | 29 +-
cpp/arcticdb/util/type_handler.hpp | 90 +-
cpp/arcticdb/util/type_traits.hpp | 5 +-
cpp/arcticdb/util/variant.hpp | 35 +-
cpp/arcticdb/version/de_dup_map.hpp | 21 +-
cpp/arcticdb/version/key_block.cpp | 64 +-
cpp/arcticdb/version/key_block.hpp | 13 +-
.../version/local_versioned_engine.cpp | 1884 +++++++-------
.../version/local_versioned_engine.hpp | 389 ++-
cpp/arcticdb/version/op_log.cpp | 74 +-
cpp/arcticdb/version/op_log.hpp | 86 +-
cpp/arcticdb/version/python_bindings.cpp | 1446 ++++++-----
cpp/arcticdb/version/python_bindings.hpp | 7 +-
cpp/arcticdb/version/schema_checks.cpp | 131 +-
cpp/arcticdb/version/schema_checks.hpp | 42 +-
cpp/arcticdb/version/snapshot.cpp | 199 +-
cpp/arcticdb/version/snapshot.hpp | 71 +-
cpp/arcticdb/version/symbol_list.cpp | 618 ++---
cpp/arcticdb/version/symbol_list.hpp | 141 +-
cpp/arcticdb/version/test/benchmark_write.cpp | 8 +-
.../version/test/rapidcheck_version_map.cpp | 195 +-
.../test/symbol_list_backwards_compat.hpp | 146 +-
cpp/arcticdb/version/test/test_append.cpp | 16 +-
cpp/arcticdb/version/test/test_key_block.cpp | 51 +-
cpp/arcticdb/version/test/test_sort_index.cpp | 28 +-
.../test/test_sorting_info_state_machine.cpp | 3 +-
cpp/arcticdb/version/test/test_sparse.cpp | 397 +--
.../version/test/test_symbol_list.cpp | 435 ++--
.../version/test/test_version_common.hpp | 4 +-
.../version/test/test_version_map.cpp | 877 ++++---
.../version/test/test_version_map_batch.cpp | 112 +-
.../version/test/test_version_store.cpp | 524 ++--
.../version/test/version_backwards_compat.hpp | 30 +-
.../version/test/version_map_model.hpp | 83 +-
cpp/arcticdb/version/version_constants.hpp | 33 +-
cpp/arcticdb/version/version_core.cpp | 2192 +++++++++--------
cpp/arcticdb/version/version_core.hpp | 309 +--
cpp/arcticdb/version/version_functions.hpp | 300 ++-
cpp/arcticdb/version/version_log.hpp | 77 +-
cpp/arcticdb/version/version_map.hpp | 657 ++---
.../version/version_map_batch_methods.cpp | 291 ++-
.../version/version_map_batch_methods.hpp | 310 ++-
cpp/arcticdb/version/version_map_entry.hpp | 442 ++--
cpp/arcticdb/version/version_store_api.cpp | 1020 ++++----
cpp/arcticdb/version/version_store_api.hpp | 370 ++-
.../version/version_store_objects.hpp | 15 +-
cpp/arcticdb/version/version_tasks.hpp | 134 +-
cpp/arcticdb/version/version_utils.cpp | 46 +-
cpp/arcticdb/version/version_utils.hpp | 239 +-
cpp/arcticdb/version/versioned_engine.hpp | 123 +-
python/.asv/results/benchmarks.json | 450 ++--
python/arcticdb/__init__.py | 2 +-
python/arcticdb/_msgpack_compat.py | 15 +-
.../adapters/arctic_library_adapter.py | 20 +-
.../adapters/azure_library_adapter.py | 11 +-
.../adapters/gcpxml_library_adapter.py | 35 +-
.../adapters/in_memory_library_adapter.py | 1 +
.../arcticdb/adapters/lmdb_library_adapter.py | 1 +
.../adapters/mongo_library_adapter.py | 1 +
.../prefixing_library_adapter_decorator.py | 1 +
python/arcticdb/arctic.py | 54 +-
python/arcticdb/authorization/permissions.py | 1 +
python/arcticdb/config.py | 1 +
python/arcticdb/dependencies.py | 6 +-
python/arcticdb/encoding_version.py | 1 +
python/arcticdb/exceptions.py | 1 +
python/arcticdb/file.py | 34 +-
python/arcticdb/flattener.py | 18 +-
python/arcticdb/log.py | 3 +-
python/arcticdb/options.py | 23 +-
python/arcticdb/preconditions.py | 1 +
python/arcticdb/scripts/update_storage.py | 4 +-
python/arcticdb/storage_fixtures/mongo.py | 12 +-
python/arcticdb/storage_fixtures/s3.py | 12 +-
python/arcticdb/supported_types.py | 1 +
python/arcticdb/toolbox/library_tool.py | 16 +-
python/arcticdb/toolbox/query_stats.py | 31 +-
python/arcticdb/tools.py | 1 +
python/arcticdb/util/arctic_simulator.py | 62 +-
python/arcticdb/util/arrow.py | 1 +
python/arcticdb/util/environment_setup.py | 343 +--
python/arcticdb/util/hypothesis.py | 10 +-
python/arcticdb/util/logger.py | 30 +-
python/arcticdb/util/marks.py | 3 +-
python/arcticdb/util/test.py | 95 +-
python/arcticdb/util/utils.py | 727 +++---
python/arcticdb/version_store/_common.py | 1 +
.../version_store/_custom_normalizers.py | 1 +
.../arcticdb/version_store/_normalization.py | 91 +-
python/arcticdb/version_store/_store.py | 107 +-
python/arcticdb/version_store/admin_tools.py | 3 +-
python/arcticdb/version_store/helper.py | 26 +-
python/arcticdb/version_store/library.py | 40 +-
python/arcticdb/version_store/processing.py | 109 +-
python/arcticdb/version_store/read_result.py | 4 +-
python/benchmarks/arrow.py | 19 +-
python/benchmarks/basic_functions.py | 6 +-
python/benchmarks/bi_benchmarks.py | 75 +-
python/benchmarks/common.py | 150 +-
python/benchmarks/comparison_benchmarks.py | 2 +-
python/benchmarks/finalize_staged_data.py | 15 +-
.../non_asv/profile_billion_row_challenge.py | 5 +-
python/benchmarks/non_asv/profile_resample.py | 57 +-
python/benchmarks/real_batch_functions.py | 55 +-
.../benchmarks/real_comparison_benchmarks.py | 74 +-
.../benchmarks/real_finalize_staged_data.py | 17 +-
python/benchmarks/real_list_operations.py | 72 +-
python/benchmarks/real_query_builder.py | 40 +-
python/benchmarks/real_read_write.py | 109 +-
python/benchmarks/resample.py | 2 +-
python/benchmarks/version_chain.py | 2 +-
python/installation_tests/client_utils.py | 74 +-
python/installation_tests/conftest.py | 38 +-
.../installation_tests/test_installation.py | 60 +-
.../compat/arcticdb/test_compatibility.py | 71 +-
.../tests/compat/arcticdb/test_lib_naming.py | 11 +-
python/tests/conftest.py | 59 +-
python/tests/enduser/test_authentication.py | 158 +-
.../arcticdb/test_aggregation_hypothesis.py | 27 +-
.../arcticdb/test_hypothesis_version_store.py | 1 +
.../hypothesis/arcticdb/test_resample.py | 79 +-
.../hypothesis/arcticdb/test_sort_merge.py | 59 +-
.../integration/arcticdb/test_admin_tools.py | 23 +-
.../tests/integration/arcticdb/test_arctic.py | 14 +-
.../integration/arcticdb/test_arctic_batch.py | 25 +-
.../arcticdb/test_finalize_staged_data.py | 332 +--
.../arcticdb/test_persistent_storage.py | 24 +-
.../arcticdb/test_read_batch_more.py | 296 +--
.../integration/arcticdb/test_storage_lock.py | 9 +-
.../arcticdb/test_unicode_strings.py | 21 +-
.../tests/integration/arcticdb/test_update.py | 296 +--
.../test_basic_operations_scenarios.py | 328 +--
.../version_store/test_basic_version_store.py | 42 +-
.../version_store/test_categorical.py | 1 +
.../version_store/test_file_config.py | 1 +
.../version_store/test_metadata_support.py | 14 +-
.../version_store/test_pandas_support.py | 1 +
.../version_store/test_symbol_sizes.py | 62 +-
.../test_update_with_date_range.py | 1 +
.../integration/storage_fixtures/test_s3.py | 24 +-
.../integration/toolbox/test_library_tool.py | 6 +-
.../adapters/test_lmdb_library_adapter.py | 1 +
.../version_store/test_descriptor_compat.py | 15 +-
.../version_store/test_nonreg_processing.py | 11 +-
.../test_nonreg_prune_previous.py | 3 +-
.../version_store/test_nonreg_sort_merge.py | 21 +-
.../version_store/test_nonreg_specific.py | 42 +-
python/tests/pytest_xfail.py | 41 +-
.../stress/arcticdb/test_stress_strings.py | 35 +-
.../version_store/test_deallocation.py | 6 +-
.../version_store/test_long_running.py | 1 +
.../arcticdb/version_store/test_mem_leaks.py | 26 +-
.../arcticdb/version_store/test_sparse.py | 4 +-
.../version_store/test_stress_append.py | 1 +
.../test_stress_dynamic_bucketize.py | 1 +
.../version_store/test_stress_multicolumn.py | 1 +
.../test_stress_sort_and_finalize.py | 85 +-
.../test_stress_symbol_list_cache.py | 28 +-
.../test_stress_write_and_reread.py | 1 -
python/tests/unit/arcticdb/test_arrow_api.py | 21 +-
python/tests/unit/arcticdb/test_config.py | 1 +
.../unit/arcticdb/test_defrag_timeseries.py | 28 +-
python/tests/unit/arcticdb/test_env_vars.py | 1 +
python/tests/unit/arcticdb/test_file_io.py | 7 +-
python/tests/unit/arcticdb/test_flattener.py | 9 +-
.../unit/arcticdb/test_library_adapters.py | 59 +-
.../unit/arcticdb/test_msgpack_compact.py | 5 +-
.../tests/unit/arcticdb/test_permissions.py | 1 +
python/tests/unit/arcticdb/test_string.py | 2 +-
python/tests/unit/arcticdb/test_write_read.py | 1 +
.../pickles_generation/python2_pickles.py | 3 +-
.../version_store/test_aggregation.py | 94 +-
.../unit/arcticdb/version_store/test_api.py | 5 +-
.../arcticdb/version_store/test_append.py | 66 +-
.../version_store/test_array_column_type.py | 3 +-
.../unit/arcticdb/version_store/test_arrow.py | 230 +-
.../version_store/test_arrow_normalization.py | 88 +-
.../version_store/test_column_type_changes.py | 100 +-
.../arcticdb/version_store/test_date_range.py | 1 +
.../version_store/test_empty_column_type.py | 622 +++--
.../version_store/test_empty_writes.py | 28 +-
.../arcticdb/version_store/test_engine.py | 1 +
.../arcticdb/version_store/test_filtering.py | 101 +-
.../test_filtering_hypothesis.py | 65 +-
.../unit/arcticdb/version_store/test_head.py | 1 +
.../version_store/test_incompletes.py | 30 +-
.../version_store/test_lazy_dataframe.py | 73 +-
.../version_store/test_missing_empty.py | 1273 +++++++---
.../version_store/test_normalization.py | 195 +-
.../test_nullable_boolean_column_type.py | 4 +
.../version_store/test_observation_time.py | 1 +
.../arcticdb/version_store/test_parallel.py | 122 +-
.../version_store/test_pickle_atomkey.py | 172 +-
.../arcticdb/version_store/test_projection.py | 24 +-
.../test_projection_hypothesis.py | 15 +-
.../version_store/test_query_builder.py | 97 +-
.../version_store/test_query_builder_batch.py | 1 +
.../test_query_builder_sparse.py | 11 +-
.../arcticdb/version_store/test_read_index.py | 152 +-
.../test_recursive_normalizers.py | 40 +-
.../arcticdb/version_store/test_resample.py | 538 ++--
.../arcticdb/version_store/test_row_range.py | 24 +-
.../arcticdb/version_store/test_sort_merge.py | 552 +++--
.../unit/arcticdb/version_store/test_stage.py | 189 +-
.../version_store/test_string_dedup.py | 1 +
.../test_symbol_concatenation.py | 172 +-
.../unit/arcticdb/version_store/test_tail.py | 1 +
.../arcticdb/version_store/test_ternary.py | 203 +-
.../arcticdb/version_store/test_unicode.py | 33 +-
.../arcticdb/version_store/test_update.py | 322 +--
.../version_store/test_version_chain.py | 15 +-
.../unit/arcticdb/version_store/test_write.py | 2 +
.../unit/simulator/test_symbol_simulator.py | 270 +-
python/tests/util/date.py | 8 +-
python/tests/util/mark.py | 51 +-
python/tests/util/storage_test.py | 46 +-
python/utils/asv_checks.py | 79 +-
python/utils/s3_roles_delete.py | 65 +-
python/utils/test.py | 11 +-
729 files changed, 40579 insertions(+), 36971 deletions(-)
mode change 100755 => 100644 cpp/arcticdb/python/gil_lock.hpp
mode change 100755 => 100644 cpp/arcticdb/python/numpy_buffer_holder.hpp
mode change 100755 => 100644 cpp/arcticdb/version/test/symbol_list_backwards_compat.hpp
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7ff0e3ce2f..130e21a470 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -144,14 +144,12 @@ jobs:
- name: Lint Python
if: always()
run: |
- python3 build_tooling/format.py --check --type python \
- || true # formatting not enforced yet
+ python3 build_tooling/format.py --check --type python
- name: Lint C++
if: always()
run: |
- python3 build_tooling/format.py --check --type cpp \
- || true # formatting not enforced yet
+ python3 build_tooling/format.py --check --type cpp
common_config:
needs: [cibw_docker_image]
diff --git a/cpp/arcticdb/arrow/array_from_block.hpp b/cpp/arcticdb/arrow/array_from_block.hpp
index aa10b8ed03..c9d7535829 100644
--- a/cpp/arcticdb/arrow/array_from_block.hpp
+++ b/cpp/arcticdb/arrow/array_from_block.hpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#pragma once
@@ -13,77 +14,78 @@
namespace arcticdb {
-inline std::optional<sparrow::validity_bitmap> create_validity_bitmap(size_t offset, const Column& column, size_t bitmap_size) {
- if(column.has_extra_buffer(offset, ExtraBufferType::BITMAP)) {
- auto &bitmap_buffer = column.get_extra_buffer(offset, ExtraBufferType::BITMAP);
- return sparrow::validity_bitmap{reinterpret_cast(bitmap_buffer.block(0)->release()), bitmap_size};
+inline std::optional<sparrow::validity_bitmap> create_validity_bitmap(
+ size_t offset, const Column& column, size_t bitmap_size
+) {
+ if (column.has_extra_buffer(offset, ExtraBufferType::BITMAP)) {
+ auto& bitmap_buffer = column.get_extra_buffer(offset, ExtraBufferType::BITMAP);
+ return sparrow::validity_bitmap{reinterpret_cast(bitmap_buffer.block(0)->release()), bitmap_size};
} else {
return std::nullopt;
}
}
-template <typename T>
+template<typename T>
sparrow::primitive_array create_primitive_array(
- T* data_ptr,
- size_t data_size,
- std::optional&& validity_bitmap) {
+ T* data_ptr, size_t data_size, std::optional&& validity_bitmap
+) {
sparrow::u8_buffer buffer(data_ptr, data_size);
- if(validity_bitmap) {
+ if (validity_bitmap) {
return sparrow::primitive_array{std::move(buffer), data_size, std::move(*validity_bitmap)};
} else {
return sparrow::primitive_array{std::move(buffer), data_size};
}
}
-template <>
+template<>
inline sparrow::primitive_array create_primitive_array(
- bool* data_ptr,
- size_t data_size,
- std::optional&& validity_bitmap) {
+ bool* data_ptr, size_t data_size, std::optional&& validity_bitmap
+) {
// We need special handling for bools because arrow uses dense bool representation (i.e. 8 bools per byte)
// Our internal representation is not dense. We use sparrow's `make_data_buffer` utility, but if needed, we can use
// our own.
auto buffer = sparrow::details::primitive_data_access::make_data_buffer(std::span{data_ptr, data_size});
- if(validity_bitmap) {
+ if (validity_bitmap) {
return sparrow::primitive_array{std::move(buffer), data_size, std::move(*validity_bitmap)};
} else {
return sparrow::primitive_array{std::move(buffer), data_size};
}
}
-template <typename T>
+template<typename T>
sparrow::timestamp_without_timezone_nanoseconds_array create_timestamp_array(
- T* data_ptr,
- size_t data_size,
- std::optional&& validity_bitmap) {
+ T* data_ptr, size_t data_size, std::optional&& validity_bitmap
+) {
static_assert(sizeof(T) == sizeof(sparrow::zoned_time_without_timezone_nanoseconds));
// We default to using timestamps without timezones. If the normalization metadata contains a timezone it will be
// applied during normalization in python layer.
sparrow::u8_buffer buffer(
- reinterpret_cast(data_ptr), data_size);
- if(validity_bitmap) {
- return sparrow::timestamp_without_timezone_nanoseconds_array{std::move(buffer), data_size, std::move(*validity_bitmap)};
+ reinterpret_cast(data_ptr), data_size
+ );
+ if (validity_bitmap) {
+ return sparrow::timestamp_without_timezone_nanoseconds_array{
+ std::move(buffer), data_size, std::move(*validity_bitmap)
+ };
} else {
return sparrow::timestamp_without_timezone_nanoseconds_array{std::move(buffer), data_size};
}
}
-template
+template
sparrow::dictionary_encoded_array create_dict_array(
- sparrow::array&& dict_values_array,
- sparrow::u8_buffer&& dict_keys_buffer,
- std::optional&& validity_bitmap
- ) {
- if(validity_bitmap) {
+ sparrow::array&& dict_values_array, sparrow::u8_buffer&& dict_keys_buffer,
+ std::optional&& validity_bitmap
+) {
+ if (validity_bitmap) {
return sparrow::dictionary_encoded_array{
- typename sparrow::dictionary_encoded_array::keys_buffer_type(std::move(dict_keys_buffer)),
- std::move(dict_values_array),
- std::move(*validity_bitmap)
+ typename sparrow::dictionary_encoded_array::keys_buffer_type(std::move(dict_keys_buffer)),
+ std::move(dict_values_array),
+ std::move(*validity_bitmap)
};
} else {
return sparrow::dictionary_encoded_array{
- typename sparrow::dictionary_encoded_array::keys_buffer_type(std::move(dict_keys_buffer)),
- std::move(dict_values_array),
+ typename sparrow::dictionary_encoded_array::keys_buffer_type(std::move(dict_keys_buffer)),
+ std::move(dict_values_array),
};
}
}
@@ -102,12 +104,11 @@ inline sparrow::big_string_array minimal_strings_dict() {
return {std::move(strings_buffer), std::move(offsets_buffer)};
}
-template
+template
sparrow::array string_dict_from_block(
- TypedBlockData& block,
- const Column& column,
- std::string_view name,
- std::optional&& maybe_bitmap) {
+ TypedBlockData& block, const Column& column, std::string_view name,
+ std::optional&& maybe_bitmap
+) {
const auto offset = block.offset();
// We use 64-bit offsets and 32-bit keys because we use a layout where each row-segment has its own arrow array.
// By default, the row-segments are 100k rows, so number of rows wouldn't exceed 32-bit ints.
@@ -119,7 +120,7 @@ sparrow::array string_dict_from_block(
// We use `int32_t` dictionary keys because pyarrow doesn't work with unsigned dictionary keys:
// https://github.com/pola-rs/polars/issues/10977
const auto block_size = block.row_count();
- sparrow::u8_buffer dict_keys_buffer{reinterpret_cast(block.release()), block_size};
+ sparrow::u8_buffer dict_keys_buffer{reinterpret_cast(block.release()), block_size};
const bool has_offset_buffer = column.has_extra_buffer(offset, ExtraBufferType::OFFSET);
const bool has_string_buffer = column.has_extra_buffer(offset, ExtraBufferType::STRING);
@@ -127,22 +128,25 @@ sparrow::array string_dict_from_block(
if (has_offset_buffer && has_string_buffer) {
auto& string_offsets = column.get_extra_buffer(offset, ExtraBufferType::OFFSET);
const auto offset_buffer_value_count = string_offsets.block(0)->bytes() / sizeof(int64_t);
- sparrow::u8_buffer offsets_buffer(reinterpret_cast(string_offsets.block(0)->release()), offset_buffer_value_count);
+ sparrow::u8_buffer offsets_buffer(
+ reinterpret_cast(string_offsets.block(0)->release()), offset_buffer_value_count
+ );
auto& strings = column.get_extra_buffer(offset, ExtraBufferType::STRING);
const auto strings_buffer_size = strings.block(0)->bytes();
- sparrow::u8_buffer strings_buffer(reinterpret_cast(strings.block(0)->release()), strings_buffer_size);
+ sparrow::u8_buffer strings_buffer(
+ reinterpret_cast(strings.block(0)->release()), strings_buffer_size
+ );
return {std::move(strings_buffer), std::move(offsets_buffer)};
} else if (!has_offset_buffer && !has_string_buffer) {
return minimal_strings_dict();
} else {
- util::raise_rte("Arrow output string creation expected either both or neither of OFFSET and STRING buffers to be present");
+ util::raise_rte("Arrow output string creation expected either both or neither of OFFSET and STRING buffers "
+ "to be present");
}
}();
auto dict_encoded = create_dict_array(
- sparrow::array{std::move(dict_values_array)},
- std::move(dict_keys_buffer),
- std::move(maybe_bitmap)
+ sparrow::array{std::move(dict_values_array)}, std::move(dict_keys_buffer), std::move(maybe_bitmap)
);
sparrow::array arr{std::move(dict_encoded)};
@@ -150,14 +154,13 @@ sparrow::array string_dict_from_block(
return arr;
}
-template <typename TagType>
+template<typename TagType>
sparrow::array arrow_array_from_block(
- TypedBlockData& block,
- std::string_view name,
- std::optional&& maybe_bitmap) {
+ TypedBlockData& block, std::string_view name, std::optional&& maybe_bitmap
+) {
using DataTagType = typename TagType::DataTypeTag;
using RawType = typename DataTagType::raw_type;
- auto *data_ptr = block.release();
+ auto* data_ptr = block.release();
const auto data_size = block.row_count();
auto arr = [&]() {
if constexpr (is_time_type(TagType::DataTypeTag::data_type)) {
@@ -172,4 +175,4 @@ sparrow::array arrow_array_from_block(
return arr;
}
-}
\ No newline at end of file
+} // namespace arcticdb
\ No newline at end of file
diff --git a/cpp/arcticdb/arrow/arrow_handlers.cpp b/cpp/arcticdb/arrow/arrow_handlers.cpp
index 669f959544..cacbc5b462 100644
--- a/cpp/arcticdb/arrow/arrow_handlers.cpp
+++ b/cpp/arcticdb/arrow/arrow_handlers.cpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#include
#include
@@ -14,46 +15,41 @@
namespace arcticdb {
void ArrowStringHandler::handle_type(
- const uint8_t *&data,
- Column& dest_column,
- const EncodedFieldImpl &field,
- const ColumnMapping& m,
- const DecodePathData& shared_data,
- std::any& handler_data,
- EncodingVersion encoding_version,
- const std::shared_ptr& string_pool) {
+ const uint8_t*& data, Column& dest_column, const EncodedFieldImpl& field, const ColumnMapping& m,
+ const DecodePathData& shared_data, std::any& handler_data, EncodingVersion encoding_version,
+ const std::shared_ptr& string_pool
+) {
ARCTICDB_SAMPLE(ArrowHandleString, 0)
util::check(field.has_ndarray(), "String handler expected array");
schema::check(
m.source_type_desc_.data_type() == DataType::UTF_DYNAMIC64,
"Cannot read column '{}' into Arrow output format as it is of unsupported type {} (only {} is supported)",
- m.frame_field_descriptor_.name(), m.source_type_desc_.data_type(), DataType::UTF_DYNAMIC64);
+ m.frame_field_descriptor_.name(),
+ m.source_type_desc_.data_type(),
+ DataType::UTF_DYNAMIC64
+ );
ARCTICDB_DEBUG(log::version(), "String handler got encoded field: {}", field.DebugString());
- const auto &ndarray = field.ndarray();
+ const auto& ndarray = field.ndarray();
const auto bytes = encoding_sizes::data_uncompressed_size(ndarray);
- Column decoded_data{m.source_type_desc_, bytes / get_type_size(m.source_type_desc_.data_type()),
- AllocationType::DYNAMIC, Sparsity::PERMITTED};
-
+ Column decoded_data{
+ m.source_type_desc_,
+ bytes / get_type_size(m.source_type_desc_.data_type()),
+ AllocationType::DYNAMIC,
+ Sparsity::PERMITTED
+ };
- data += decode_field(m.source_type_desc_, field, data, decoded_data, decoded_data.opt_sparse_map(), encoding_version);
+ data += decode_field(
+ m.source_type_desc_, field, data, decoded_data, decoded_data.opt_sparse_map(), encoding_version
+ );
- convert_type(
- decoded_data,
- dest_column,
- m,
- shared_data,
- handler_data,
- string_pool);
+ convert_type(decoded_data, dest_column, m, shared_data, handler_data, string_pool);
}
void ArrowStringHandler::convert_type(
- const Column& source_column,
- Column& dest_column,
- const ColumnMapping& mapping,
- const DecodePathData&,
- std::any&,
- const std::shared_ptr& string_pool) const {
+ const Column& source_column, Column& dest_column, const ColumnMapping& mapping, const DecodePathData&,
+ std::any&, const std::shared_ptr& string_pool
+) const {
using ArcticStringColumnTag = ScalarTagType>;
auto input_data = source_column.data();
struct DictEntry {
@@ -69,7 +65,9 @@ void ArrowStringHandler::convert_type(
unique_offsets.reserve(source_column.row_count());
int64_t bytes = 0;
int32_t unique_offset_count = 0;
- auto dest_ptr = reinterpret_cast(dest_column.bytes_at(mapping.offset_bytes_, source_column.row_count() * sizeof(int32_t)));
+ auto dest_ptr = reinterpret_cast(
+ dest_column.bytes_at(mapping.offset_bytes_, source_column.row_count() * sizeof(int32_t))
+ );
util::BitSet bitset;
util::BitSet::bulk_insert_iterator inserter(bitset);
@@ -78,7 +76,12 @@ void ArrowStringHandler::convert_type(
// TODO: This can't be right if the column was sparse as it has only been decoded, not expanded
for (auto en = input_data.cbegin(); en != end; ++en) {
if (is_a_string(en->value())) {
- auto [entry, is_emplaced] = unique_offsets.try_emplace(en->value(), DictEntry{static_cast(unique_offset_count), bytes, string_pool->get_const_view(en->value())});
+ auto [entry, is_emplaced] = unique_offsets.try_emplace(
+ en->value(),
+ DictEntry{
+ static_cast(unique_offset_count), bytes, string_pool->get_const_view(en->value())
+ }
+ );
if (is_emplaced) {
bytes += entry->second.strv.size();
unique_offsets_in_order.push_back(en->value());
@@ -102,14 +105,22 @@ void ArrowStringHandler::convert_type(
create_dense_bitmap(mapping.offset_bytes_, bitset, dest_column, AllocationType::DETACHABLE);
} // else there weren't any Nones or NaNs
// bitset.count() == 0 is the special case where all of the rows contained None or NaN. In this case, do not create
- // the extra string and offset buffers. string_dict_from_block will then do the right thing and call minimal_strings_dict
+ // the extra string and offset buffers. string_dict_from_block will then do the right thing and call
+ // minimal_strings_dict
if (bitset.count() > 0) {
- auto& string_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE);
- auto& offsets_buffer = dest_column.create_extra_buffer(mapping.offset_bytes_, ExtraBufferType::OFFSET, (unique_offsets_in_order.size() + 1) * sizeof(int64_t), AllocationType::DETACHABLE);
+ auto& string_buffer = dest_column.create_extra_buffer(
+ mapping.offset_bytes_, ExtraBufferType::STRING, bytes, AllocationType::DETACHABLE
+ );
+ auto& offsets_buffer = dest_column.create_extra_buffer(
+ mapping.offset_bytes_,
+ ExtraBufferType::OFFSET,
+ (unique_offsets_in_order.size() + 1) * sizeof(int64_t),
+ AllocationType::DETACHABLE
+ );
// Then go through unique_offsets to fill up the offset and string buffers.
auto offsets_ptr = reinterpret_cast(offsets_buffer.data());
auto string_ptr = reinterpret_cast(string_buffer.data());
- for (auto unique_offset: unique_offsets_in_order) {
+ for (auto unique_offset : unique_offsets_in_order) {
const auto& entry = unique_offsets[unique_offset];
*offsets_ptr++ = entry.string_buffer_pos_;
memcpy(string_ptr, entry.strv.data(), entry.strv.size());
@@ -123,17 +134,11 @@ TypeDescriptor ArrowStringHandler::output_type(const TypeDescriptor&) const {
return make_scalar_type(DataType::UTF_DYNAMIC32);
}
-int ArrowStringHandler::type_size() const {
- return sizeof(uint32_t);
-}
+int ArrowStringHandler::type_size() const { return sizeof(uint32_t); }
void ArrowStringHandler::default_initialize(
- ChunkedBuffer& /*buffer*/,
- size_t /*offset*/,
- size_t /*byte_size*/,
- const DecodePathData& /*shared_data*/,
- std::any& /*handler_data*/) const {
-
-}
+ ChunkedBuffer& /*buffer*/, size_t /*offset*/, size_t /*byte_size*/, const DecodePathData& /*shared_data*/,
+ std::any& /*handler_data*/
+) const {}
} // namespace arcticdb
\ No newline at end of file
diff --git a/cpp/arcticdb/arrow/arrow_handlers.hpp b/cpp/arcticdb/arrow/arrow_handlers.hpp
index 4a2a840d14..8432e9a4d4 100644
--- a/cpp/arcticdb/arrow/arrow_handlers.hpp
+++ b/cpp/arcticdb/arrow/arrow_handlers.hpp
@@ -1,9 +1,10 @@
/* Copyright 2025 Man Group Operations Limited
-*
-* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
-*
-* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
-*/
+ *
+ * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
+ *
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
+ */
#pragma once
#include
@@ -13,54 +14,47 @@ namespace arcticdb {
struct ArrowStringHandler {
void handle_type(
- const uint8_t*& data,
- Column& dest_column,
- const EncodedFieldImpl &field,
- const ColumnMapping& m,
- const DecodePathData& shared_data,
- std::any& handler_data,
- EncodingVersion encoding_version,
- const std::shared_ptr& string_pool
+ const uint8_t*& data, Column& dest_column, const EncodedFieldImpl& field, const ColumnMapping& m,
+ const DecodePathData& shared_data, std::any& handler_data, EncodingVersion encoding_version,
+ const std::shared_ptr& string_pool
);
[[nodiscard]] int type_size() const;
void convert_type(
- const Column& source_column,
- Column& dest_column,
- const ColumnMapping& mapping,
- const DecodePathData& shared_data,
- std::any& handler_data,
- const std::shared_ptr& string_pool) const;
+ const Column& source_column, Column& dest_column, const ColumnMapping& mapping,
+ const DecodePathData& shared_data, std::any& handler_data, const std::shared_ptr& string_pool
+ ) const;
[[nodiscard]] entity::TypeDescriptor output_type(const entity::TypeDescriptor& input_type) const;
void default_initialize(
- ChunkedBuffer& buffer,
- size_t offset,
- size_t byte_size,
- const DecodePathData& shared_data,
- std::any& handler_data) const;
+ ChunkedBuffer& buffer, size_t offset, size_t byte_size, const DecodePathData& shared_data,
+ std::any& handler_data
+ ) const;
};
-struct ArrowHandlerDataFactory : public TypeHandlerDataFactory {
- std::any get_data() const override {
- return {};
- }
+struct ArrowHandlerDataFactory : public TypeHandlerDataFactory {
+ std::any get_data() const override { return {}; }
};
inline void register_arrow_handler_data_factory() {
TypeHandlerRegistry::instance()->set_handler_data(OutputFormat::ARROW, std::make_unique());
}
-
inline void register_arrow_string_types() {
using namespace arcticdb;
constexpr std::array dynamic_string_data_types = {
- entity::DataType::ASCII_DYNAMIC64, entity::DataType::UTF_DYNAMIC64, entity::DataType::ASCII_FIXED64, entity::DataType::UTF_FIXED64};
+ entity::DataType::ASCII_DYNAMIC64,
+ entity::DataType::UTF_DYNAMIC64,
+ entity::DataType::ASCII_FIXED64,
+ entity::DataType::UTF_FIXED64
+ };
for (auto data_type : dynamic_string_data_types) {
- TypeHandlerRegistry::instance()->register_handler(OutputFormat::ARROW, make_scalar_type(data_type), arcticdb::ArrowStringHandler{});
+ TypeHandlerRegistry::instance()->register_handler(
+ OutputFormat::ARROW, make_scalar_type(data_type), arcticdb::ArrowStringHandler{}
+ );
}
}
diff --git a/cpp/arcticdb/arrow/arrow_output_frame.cpp b/cpp/arcticdb/arrow/arrow_output_frame.cpp
index 001fa1015f..c9470318df 100644
--- a/cpp/arcticdb/arrow/arrow_output_frame.cpp
+++ b/cpp/arcticdb/arrow/arrow_output_frame.cpp
@@ -1,10 +1,10 @@
/* Copyright 2025 Man Group Operations Limited
-*
-* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
-*
-* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
-*/
-
+ *
+ * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
+ *
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
+ */
#include
@@ -12,12 +12,11 @@
namespace arcticdb {
-ArrowOutputFrame::ArrowOutputFrame(
- std::shared_ptr>&& data) :
+ArrowOutputFrame::ArrowOutputFrame(std::shared_ptr>&& data) :
data_(std::move(data)) {}
size_t ArrowOutputFrame::num_blocks() const {
- if(!data_ || data_->empty())
+ if (!data_ || data_->empty())
return 0;
return data_->size();
@@ -30,7 +29,7 @@ std::vector ArrowOutputFrame::extract_record_batches() {
}
output.reserve(data_->size());
- for(auto& batch : *data_) {
+ for (auto& batch : *data_) {
auto struct_array = sparrow::array{batch.extract_struct_array()};
auto [arr, schema] = sparrow::extract_arrow_structures(std::move(struct_array));
@@ -40,4 +39,4 @@ std::vector ArrowOutputFrame::extract_record_batches() {
return output;
}
-} // namespace arcticdb
\ No newline at end of file
+} // namespace arcticdb
\ No newline at end of file
diff --git a/cpp/arcticdb/arrow/arrow_output_frame.hpp b/cpp/arcticdb/arrow/arrow_output_frame.hpp
index e5b4508bb9..239fbe3d43 100644
--- a/cpp/arcticdb/arrow/arrow_output_frame.hpp
+++ b/cpp/arcticdb/arrow/arrow_output_frame.hpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#pragma once
@@ -14,28 +15,20 @@ namespace arcticdb {
// C arrow representation of a record batch. Can be converted to a pyarrow.RecordBatch zero copy.
struct RecordBatchData {
- RecordBatchData(ArrowArray array, ArrowSchema schema) :
- array_(array),
- schema_(schema) {
- }
+ RecordBatchData(ArrowArray array, ArrowSchema schema) : array_(array), schema_(schema) {}
ArrowArray array_;
ArrowSchema schema_;
- uintptr_t array() {
- return reinterpret_cast(&array_);
- }
+ uintptr_t array() { return reinterpret_cast(&array_); }
- uintptr_t schema() {
- return reinterpret_cast(&schema_);
- }
+ uintptr_t schema() { return reinterpret_cast(&schema_); }
};
struct ArrowOutputFrame {
ArrowOutputFrame() = default;
- ArrowOutputFrame(
- std::shared_ptr>&& data);
+ ArrowOutputFrame(std::shared_ptr>&& data);
std::shared_ptr> data_;
diff --git a/cpp/arcticdb/arrow/arrow_utils.cpp b/cpp/arcticdb/arrow/arrow_utils.cpp
index cd5c4fc04e..863540abae 100644
--- a/cpp/arcticdb/arrow/arrow_utils.cpp
+++ b/cpp/arcticdb/arrow/arrow_utils.cpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#include
@@ -14,7 +15,7 @@
namespace arcticdb {
sparrow::array empty_arrow_array_from_type(const TypeDescriptor& type, std::string_view name) {
- auto res = type.visit_tag([](auto &&impl) {
+ auto res = type.visit_tag([](auto&& impl) {
using TagType = std::decay_t;
using DataTagType = typename TagType::DataTypeTag;
using RawType = typename DataTagType::raw_type;
@@ -22,8 +23,7 @@ sparrow::array empty_arrow_array_from_type(const TypeDescriptor& type, std::stri
if constexpr (is_sequence_type(TagType::DataTypeTag::data_type)) {
sparrow::u8_buffer dict_keys_buffer{nullptr, 0};
auto dict_values_array = minimal_strings_dict();
- return sparrow::array{
- create_dict_array(
+ return sparrow::array{create_dict_array(
sparrow::array{std::move(dict_values_array)},
std::move(dict_keys_buffer),
std::move(validity_bitmap)
@@ -42,7 +42,7 @@ std::vector arrow_arrays_from_column(const Column& column, std::
std::vector vec;
auto column_data = column.data();
vec.reserve(column.num_blocks());
- column.type().visit_tag([&vec, &column_data, &column, name](auto &&impl) {
+ column.type().visit_tag([&vec, &column_data, &column, name](auto&& impl) {
using TagType = std::decay_t;
if (column_data.num_blocks() == 0) {
// For empty columns we want to return one empty array instead of no arrays.
@@ -68,18 +68,31 @@ std::shared_ptr> segment_to_arrow_data(Segmen
// column_blocks == 0 is a special case where we are returning a zero-row structure (e.g. if date_range is
// provided outside of the time range covered by the symbol)
- auto output = std::make_shared>(column_blocks == 0 ? 1 : column_blocks, sparrow::record_batch{});
+ auto output = std::make_shared>(
+ column_blocks == 0 ? 1 : column_blocks, sparrow::record_batch{}
+ );
for (auto i = 0UL; i < num_columns; ++i) {
auto& column = segment.column(static_cast(i));
- util::check(column.num_blocks() == column_blocks, "Non-standard column block number: {} != {}", column.num_blocks(), column_blocks);
+ util::check(
+ column.num_blocks() == column_blocks,
+ "Non-standard column block number: {} != {}",
+ column.num_blocks(),
+ column_blocks
+ );
auto column_arrays = arrow_arrays_from_column(column, segment.field(i).name());
- util::check(column_arrays.size() == output->size(), "Unexpected number of arrow arrays returned: {} != {}", column_arrays.size(), output->size());
+ util::check(
+ column_arrays.size() == output->size(),
+ "Unexpected number of arrow arrays returned: {} != {}",
+ column_arrays.size(),
+ output->size()
+ );
for (auto block_idx = 0UL; block_idx < column_arrays.size(); ++block_idx) {
util::check(block_idx < output->size(), "Block index overflow {} > {}", block_idx, output->size());
- (*output)[block_idx].add_column(static_cast(segment.field(i).name()),
- std::move(column_arrays[block_idx]));
+ (*output)[block_idx].add_column(
+ static_cast(segment.field(i).name()), std::move(column_arrays[block_idx])
+ );
}
}
return output;
diff --git a/cpp/arcticdb/arrow/arrow_utils.hpp b/cpp/arcticdb/arrow/arrow_utils.hpp
index 8686c53964..3beb595aac 100644
--- a/cpp/arcticdb/arrow/arrow_utils.hpp
+++ b/cpp/arcticdb/arrow/arrow_utils.hpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#pragma once
diff --git a/cpp/arcticdb/arrow/test/test_arrow.cpp b/cpp/arcticdb/arrow/test/test_arrow.cpp
index c3cf31d98a..82636a45d2 100644
--- a/cpp/arcticdb/arrow/test/test_arrow.cpp
+++ b/cpp/arcticdb/arrow/test/test_arrow.cpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#include
@@ -17,9 +18,11 @@
using namespace arcticdb;
template
-void allocate_and_fill_chunked_column(Column& column, size_t num_rows, size_t chunk_size, std::optional> values = std::nullopt) {
+void allocate_and_fill_chunked_column(
+ Column& column, size_t num_rows, size_t chunk_size, std::optional> values = std::nullopt
+) {
// Allocate column in chunks
- for (size_t row = 0; row < num_rows; row+=chunk_size) {
+ for (size_t row = 0; row < num_rows; row += chunk_size) {
auto data_size = data_type_size(column.type(), OutputFormat::ARROW, DataTypeMode::EXTERNAL);
auto current_block_size = std::min(chunk_size, num_rows - row);
auto bytes = current_block_size * data_size;
@@ -37,13 +40,17 @@ void allocate_and_fill_chunked_column(Column& column, size_t num_rows, size_t ch
}
}
-SegmentInMemory get_detachable_segment(StreamId symbol, std::span fields, size_t num_rows, size_t chunk_size) {
+SegmentInMemory get_detachable_segment(
+ StreamId symbol, std::span fields, size_t num_rows, size_t chunk_size
+) {
auto num_columns = fields.size();
- SegmentInMemory segment(get_test_descriptor(symbol, fields), 0, AllocationType::DETACHABLE);
+ SegmentInMemory segment(
+ get_test_descriptor(symbol, fields), 0, AllocationType::DETACHABLE
+ );
- for (auto i=0u; i < num_columns+1; ++i) {
+ for (auto i = 0u; i < num_columns + 1; ++i) {
auto& column = segment.column(i);
- column.type().visit_tag([&column, &num_rows, &chunk_size](auto &&impl) {
+ column.type().visit_tag([&column, &num_rows, &chunk_size](auto&& impl) {
using TagType = std::decay_t;
using RawType = typename TagType::DataTypeTag::raw_type;
allocate_and_fill_chunked_column(column, num_rows, chunk_size);
@@ -54,7 +61,10 @@ SegmentInMemory get_detachable_segment(StreamId symbol, std::span string_pool, const std::vector& values) {
+void fill_chunked_string_column(
+ Column& column, size_t num_rows, size_t chunk_size, std::shared_ptr string_pool,
+ const std::vector& values
+) {
auto num_chunks = num_rows / chunk_size + (num_rows % chunk_size != 0);
std::vector string_pool_offsets;
@@ -66,28 +76,29 @@ void fill_chunked_string_column(Column& column, size_t num_rows, size_t chunk_si
auto handler = ArrowStringHandler();
auto source_type_desc = TypeDescriptor{DataType::UTF_DYNAMIC64, Dimension::Dim0};
auto dest_type_desc = TypeDescriptor{DataType::UTF_DYNAMIC32, Dimension::Dim0};
- for (auto chunk=0u; chunk(column, num_rows, chunk_size);
auto arrow_arrays = arrow_arrays_from_column(column, "col");
EXPECT_EQ(arrow_arrays.size(), num_chunks);
for (const auto& arr : arrow_arrays) {
EXPECT_EQ(arr.name(), "col");
}
- for (auto row=0u; row < num_rows; ++row) {
+ for (auto row = 0u; row < num_rows; ++row) {
auto chunk = row / chunk_size;
auto pos = row % chunk_size;
EXPECT_EQ(std::get>(arrow_arrays[chunk][pos]).get(), static_cast(row));
@@ -119,7 +132,7 @@ TEST(Arrow, ColumnString) {
std::vector column_values;
column_values.reserve(num_rows);
- for (auto i=0u; i < num_rows; ++i) {
+ for (auto i = 0u; i < num_rows; ++i) {
column_values.push_back(strings[i % strings.size()]);
}
@@ -131,7 +144,7 @@ TEST(Arrow, ColumnString) {
fill_chunked_string_column(column, num_rows, chunk_size, pool, column_values);
// Verify applying the string handler sets the correct external buffers
- for (auto chunk=0u; chunk(global_row);
auto offset_begin = offset_buffer.cast(id);
- auto str_size = offset_buffer.cast(id+1) - offset_begin;
- auto str_in_column = std::string_view(
- reinterpret_cast(string_buffer.bytes_at(offset_begin, str_size)),
- str_size);
+ auto str_size = offset_buffer.cast(id + 1) - offset_begin;
+ auto str_in_column =
+ std::string_view(reinterpret_cast(string_buffer.bytes_at(offset_begin, str_size)), str_size);
EXPECT_EQ(str_in_column, column_values[global_row]);
}
}
@@ -163,7 +175,7 @@ TEST(Arrow, ColumnString) {
for (const auto& arr : arrow_arrays) {
EXPECT_EQ(arr.name(), "col");
}
- for (auto row=0u; row < num_rows; ++row) {
+ for (auto row = 0u; row < num_rows; ++row) {
auto chunk = row / chunk_size;
auto pos = row % chunk_size;
auto value = arrow_arrays[chunk][pos];
@@ -178,9 +190,9 @@ TEST(Arrow, ConvertSegmentBasic) {
const auto chunk_size = 10u;
const auto num_chunks = num_rows / chunk_size;
const auto fields = std::array{
- scalar_field(DataType::UINT8, "smallints"),
- scalar_field(DataType::INT64, "bigints"),
- scalar_field(DataType::FLOAT64, "floats"),
+ scalar_field(DataType::UINT8, "smallints"),
+ scalar_field(DataType::INT64, "bigints"),
+ scalar_field(DataType::FLOAT64, "floats"),
};
auto segment = get_detachable_segment(symbol, fields, num_rows, chunk_size);
// Verify the index column has the expected number of chunks
@@ -224,9 +236,9 @@ TEST(Arrow, ConvertSegmentMultipleStringColumns) {
const auto chunk_size = 19u;
const auto num_chunks = num_rows / chunk_size + (num_rows % chunk_size != 0);
const auto fields = std::array{
- scalar_field(DataType::FLOAT64, "floats"),
- scalar_field(DataType::UTF_DYNAMIC32, "str_1"),
- scalar_field(DataType::UTF_DYNAMIC32, "str_2"),
+ scalar_field(DataType::FLOAT64, "floats"),
+ scalar_field(DataType::UTF_DYNAMIC32, "str_1"),
+ scalar_field(DataType::UTF_DYNAMIC32, "str_2"),
};
// We populate string columns so they have 30 different and 70 common strings.
const auto str_id_offset = 30u;
@@ -244,8 +256,8 @@ TEST(Arrow, ConvertSegmentMultipleStringColumns) {
// Convert to arrow
auto arrow_data = segment_to_arrow_data(segment);
EXPECT_EQ(arrow_data->size(), num_chunks);
- for (auto i=0u; i < num_chunks; ++i) {
- auto row_count = std::min(chunk_size, num_rows - i*chunk_size);
+ for (auto i = 0u; i < num_chunks; ++i) {
+ auto row_count = std::min(chunk_size, num_rows - i * chunk_size);
const auto& record_batch = (*arrow_data)[i];
auto names = record_batch.names();
auto columns = record_batch.columns();
@@ -258,10 +270,14 @@ TEST(Arrow, ConvertSegmentMultipleStringColumns) {
EXPECT_EQ(columns[1].data_type(), sparrow::data_type::DOUBLE);
EXPECT_EQ(names[2], "str_1");
EXPECT_EQ(columns[2].data_type(), sparrow::data_type::INT32); // The dict array keys are INT32s
- assert_arrow_string_array_as_expected(columns[2], std::span(string_values[0]).subspan(i*chunk_size, row_count));
+ assert_arrow_string_array_as_expected(
+ columns[2], std::span(string_values[0]).subspan(i * chunk_size, row_count)
+ );
EXPECT_EQ(names[3], "str_2");
EXPECT_EQ(columns[3].data_type(), sparrow::data_type::INT32); // The dict array keys are INT32s
- assert_arrow_string_array_as_expected(columns[3], std::span(string_values[1]).subspan(i*chunk_size, row_count));
+ assert_arrow_string_array_as_expected(
+ columns[3], std::span(string_values[1]).subspan(i * chunk_size, row_count)
+ );
for (const auto& col : columns) {
EXPECT_EQ(col.size(), row_count);
}
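For reference, the chunk arithmetic these tests rely on is plain ceiling division and modular indexing. A minimal, self-contained C++ sketch (helper names are illustrative only, not part of the ArcticDB API):

#include <cassert>
#include <cstddef>

// Illustrative helpers mirroring the arithmetic used in the tests above.
std::size_t num_chunks_for(std::size_t num_rows, std::size_t chunk_size) {
    // Ceiling division: a trailing partial chunk still counts as a chunk.
    return num_rows / chunk_size + (num_rows % chunk_size != 0);
}

std::size_t chunk_of(std::size_t row, std::size_t chunk_size) { return row / chunk_size; }
std::size_t pos_in_chunk(std::size_t row, std::size_t chunk_size) { return row % chunk_size; }

int main() {
    // E.g. 100 rows split into chunks of 19: six chunks, the last holding 100 - 5 * 19 = 5 rows.
    assert(num_chunks_for(100, 19) == 6);
    assert(chunk_of(98, 19) == 5 && pos_in_chunk(98, 19) == 3);
    return 0;
}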
diff --git a/cpp/arcticdb/async/async_store.cpp b/cpp/arcticdb/async/async_store.cpp
index c1cf4fdbbd..44390d4ac9 100644
--- a/cpp/arcticdb/async/async_store.cpp
+++ b/cpp/arcticdb/async/async_store.cpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#include
@@ -11,19 +12,15 @@
namespace arcticdb::async {
DeDupLookupResult lookup_match_in_dedup_map(
- const std::shared_ptr<DeDupMap> &de_dup_map,
- storage::KeySegmentPair& key_seg) {
+ const std::shared_ptr<DeDupMap>& de_dup_map, storage::KeySegmentPair& key_seg
+) {
std::optional<AtomKey> de_dup_key;
if (!de_dup_map || !(de_dup_key = de_dup_map->get_key_if_present(key_seg.atom_key()))) {
- ARCTICDB_DEBUG(log::version(),
- "No existing key with same contents: writing new object {}",
- key_seg.atom_key());
+ ARCTICDB_DEBUG(log::version(), "No existing key with same contents: writing new object {}", key_seg.atom_key());
return key_seg;
} else {
- ARCTICDB_DEBUG(log::version(),
- "Found existing key with same contents: using existing object {}",
- *de_dup_key);
+ ARCTICDB_DEBUG(log::version(), "Found existing key with same contents: using existing object {}", *de_dup_key);
return *de_dup_key;
}
}
-}
+} // namespace arcticdb::async
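The reformatted lookup_match_in_dedup_map keeps its original behaviour: return the existing key when the de-dup map already holds an object with the same contents, otherwise hand the new key/segment pair back to the caller for writing, packaged in a std::variant. A minimal sketch of that pattern with simplified stand-in types (the real DeDupMap, AtomKey and KeySegmentPair are richer than shown here):

#include <iostream>
#include <map>
#include <string>
#include <variant>

// Simplified stand-ins, for illustration only.
using AtomKey = std::string;                                      // content-addressed key
struct KeySegmentPair { AtomKey key; };                           // key plus (elided) segment payload
using DeDupLookupResult = std::variant<KeySegmentPair, AtomKey>;  // new object vs. existing key

// Same shape as the function above: reuse an existing object if one matches,
// otherwise return the new object so the caller proceeds with the write.
DeDupLookupResult lookup(const std::map<AtomKey, AtomKey>& de_dup_map, const KeySegmentPair& key_seg) {
    if (auto it = de_dup_map.find(key_seg.key); it != de_dup_map.end())
        return it->second;  // existing object: the write can be skipped
    return key_seg;         // new object: write it
}

int main() {
    std::map<AtomKey, AtomKey> seen{{"abc", "abc"}};
    std::cout << std::holds_alternative<AtomKey>(lookup(seen, KeySegmentPair{"abc"})) << ' '
              << std::holds_alternative<KeySegmentPair>(lookup(seen, KeySegmentPair{"xyz"})) << '\n';  // prints "1 1"
    return 0;
}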
diff --git a/cpp/arcticdb/async/async_store.hpp b/cpp/arcticdb/async/async_store.hpp
index 496d93cd0d..44cedee7a3 100644
--- a/cpp/arcticdb/async/async_store.hpp
+++ b/cpp/arcticdb/async/async_store.hpp
@@ -2,7 +2,8 @@
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
- * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
+ * will be governed by the Apache License, version 2.0.
*/
#pragma once
@@ -16,8 +17,8 @@
#include
#include
-namespace arcticdb::toolbox::apy{
- class LibraryTool;
+namespace arcticdb::toolbox::apy {
+class LibraryTool;
}
namespace arcticdb::async {
@@ -27,17 +28,20 @@ using NewObject = storage::KeySegmentPair;
using DeDupLookupResult = std::variant;
DeDupLookupResult lookup_match_in_dedup_map(
- const std::shared_ptr<DeDupMap> &de_dup_map,
- storage::KeySegmentPair& key_seg);
+ const std::shared_ptr<DeDupMap>& de_dup_map, storage::KeySegmentPair& key_seg
+);
-template <typename Callable>
-auto read_and_continue(const VariantKey& key, std::shared_ptr<storage::Library> library, const storage::ReadKeyOpts& opts, Callable&& c) {
+template <typename Callable>
+auto read_and_continue(
+ const VariantKey& key, std::shared_ptr<storage::Library> library, const storage::ReadKeyOpts& opts, Callable&& c
+) {
return async::submit_io_task(ReadCompressedTask{key, library, opts, std::forward<Callable>(c)})
- .thenValueInline([](auto &&result) mutable {
- auto&& [key_seg_fut, continuation] = std::forward(result);
- return std::move(key_seg_fut).thenValueInline([continuation=std::move(continuation)] (storage::KeySegmentPair&& key_seg) mutable { return continuation(std::move(key_seg)); });
- }
- );
+ .thenValueInline([](auto&& result) mutable {
+ auto&& [key_seg_fut, continuation] = std::forward<decltype(result)>(result);
+ return std::move(key_seg_fut)
+ .thenValueInline([continuation = std::move(continuation)](storage::KeySegmentPair&& key_seg
+ ) mutable { return continuation(std::move(key_seg)); });
+ });
}
/*
@@ -49,413 +53,412 @@ auto read_and_continue(const VariantKey& key, std::shared_ptr
*/
template
class AsyncStore : public Store {
-public:
+ public:
AsyncStore(
- std::shared_ptr<storage::Library> library,
- const proto::encoding::VariantCodec &codec,
- EncodingVersion encoding_version
+ std::shared_ptr<storage::Library> library, const proto::encoding::VariantCodec& codec,
+ EncodingVersion encoding_version
) :
library_(std::move(library)),
codec_(std::make_shared<proto::encoding::VariantCodec>(codec)),
- encoding_version_(encoding_version) {
+ encoding_version_(encoding_version) {}
+
+ folly::Future<entity::VariantKey> write(
+ stream::KeyType key_type, VersionId version_id, const StreamId& stream_id, IndexValue start_index,
+ IndexValue end_index, SegmentInMemory&& segment
+ ) override {
+
+ util::check(
+ segment.descriptor().id() == stream_id,
+ "Descriptor id mismatch in atom key {} != {}",
+ stream_id,
+ segment.descriptor().id()
+ );
+
+ return async::submit_cpu_task(EncodeAtomTask{
+ key_type,
+ version_id,
+ stream_id,
+ start_index,
+ end_index,
+ current_timestamp(),
+ std::move(segment),
+ codec_,
+ encoding_version_
+ })
+ .via(&async::io_executor())
+ .thenValue(WriteSegmentTask{library_});
}
folly::Future<entity::VariantKey> write(
- stream::KeyType key_type,
- VersionId version_id,
- const StreamId &stream_id,
- IndexValue start_index,
- IndexValue end_index,
- SegmentInMemory &&segment) override {
+ stream::KeyType key_type, VersionId version_id, const StreamId& stream_id, timestamp creation_ts,
+ IndexValue start_index, IndexValue end_index, SegmentInMemory&& segment
+ ) override {
- util::check(segment.descriptor().id() == stream_id, "Descriptor id mismatch in atom key {} != {}", stream_id, segment.descriptor().id());
+ util::check(
+ segment.descriptor().id() == stream_id,
+ "Descriptor id mismatch in atom key {} != {}",
+ stream_id,
+ segment.descriptor().id()
+ );
return async::submit_cpu_task(EncodeAtomTask{
- key_type, version_id, stream_id, start_index, end_index, current_timestamp(),
- std::move(segment), codec_, encoding_version_
- }).via(&async::io_executor())
- .thenValue(WriteSegmentTask{library_});
- }
-
-folly::Future<entity::VariantKey> write(
- stream::KeyType key_type,
- VersionId version_id,
- const StreamId &stream_id,
- timestamp creation_ts,
- IndexValue start_index,
- IndexValue end_index,
- SegmentInMemory &&segment) override {
-
- util::check(segment.descriptor().id() == stream_id, "Descriptor id mismatch in atom key {} != {}", stream_id, segment.descriptor().id());
-
- return async::submit_cpu_task(EncodeAtomTask{
- key_type, version_id, stream_id, start_index, end_index, creation_ts,
- std::move(segment), codec_, encoding_version_
- })
- .via(&async::io_executor())
- .thenValue(WriteSegmentTask{library_});
-}
+ key_type,
+ version_id,
+ stream_id,
+ start_index,
+ end_index,
+ creation_ts,
+ std::move(segment),
+ codec_,
+ encoding_version_
+ })
+ .via(&async::io_executor())
+ .thenValue(WriteSegmentTask{library_});
+ }
-folly::Future<entity::VariantKey> write(PartialKey pk, SegmentInMemory &&segment) override {
- return write(pk.key_type, pk.version_id, pk.stream_id, pk.start_index, pk.end_index, std::move(segment));
-}
+ folly::Future<entity::VariantKey> write(PartialKey pk, SegmentInMemory&& segment) override {
+ return write(pk.key_type, pk.version_id, pk.stream_id, pk.start_index, pk.end_index, std::move(segment));
+ }
-folly::Future<entity::VariantKey> write(
- KeyType key_type,
- const StreamId &stream_id,
- SegmentInMemory &&segment) override {
- util::check(is_ref_key_class(key_type), "Expected ref key type got {}", key_type);
- return async::submit_cpu_task(EncodeRefTask{
- key_type, stream_id, std::move(segment), codec_, encoding_version_
- })
- .via(&async::io_executor())
- .thenValue(WriteSegmentTask{library_});
-}
+ folly::Future<entity::VariantKey> write(KeyType key_type, const StreamId& stream_id, SegmentInMemory&& segment)
+ override {
+ util::check(is_ref_key_class(key_type), "Expected ref key type got {}", key_type);
+ return async::submit_cpu_task(EncodeRefTask{key_type, stream_id, std::move(segment), codec_, encoding_version_})
+ .via(&async::io_executor())
+ .thenValue(WriteSegmentTask{library_});
+ }
-folly::Future<entity::VariantKey> write_maybe_blocking(PartialKey pk, SegmentInMemory &&segment, std::shared_ptr semaphore) override {
- log::version().debug("Waiting for semaphore for write_maybe_blocking {}", pk);
- semaphore->wait();
- log::version().debug("Starting write_maybe_blocking {}", pk);
- return write(pk.key_type, pk.version_id, pk.stream_id, pk.start_index, pk.end_index, std::move(segment))
- .thenTryInline([semaphore](folly::Try keyTry) {
- semaphore->post();
- keyTry.throwUnlessValue();
- return keyTry.value();
- });
-}
+ folly::Future<entity::VariantKey> write_maybe_blocking(
+ PartialKey pk, SegmentInMemory&& segment, std::shared_ptr semaphore
+ ) override {
+ log::version().debug("Waiting for semaphore for write_maybe_blocking {}", pk);
+ semaphore->wait();
+ log::version().debug("Starting write_maybe_blocking {}", pk);
+ return write(pk.key_type, pk.version_id, pk.stream_id, pk.start_index, pk.end_index, std::move(segment))
+ .thenTryInline([semaphore](folly::Try keyTry) {
+ semaphore->post();
+ keyTry.throwUnlessValue();
+ return keyTry.value();
+ });
+ }
-entity::VariantKey write_sync(
- stream::KeyType key_type,
- VersionId version_id,
- const StreamId &stream_id,
- IndexValue start_index,
- IndexValue end_index,
- SegmentInMemory &&segment) override {
+ entity::VariantKey write_sync(
+ stream::KeyType key_type, VersionId version_id, const StreamId& stream_id, IndexValue start_index,
+ IndexValue end_index, SegmentInMemory&& segment
+ ) override {
- util::check(segment.descriptor().id() == stream_id,
+ util::check(
+ segment.descriptor().id() == stream_id,
"Descriptor id mismatch in atom key {} != {}",
stream_id,
- segment.descriptor().id());
+ segment.descriptor().id()
+ );
- auto encoded = EncodeAtomTask{
- key_type, version_id, stream_id, start_index, end_index, current_timestamp(),
- std::move(segment), codec_, encoding_version_
- }();
- return WriteSegmentTask{library_}(std::move(encoded));
-}
+ auto encoded = EncodeAtomTask{
+ key_type,
+ version_id,
+ stream_id,
+ start_index,
+ end_index,
+ current_timestamp(),
+ std::move(segment),
+ codec_,
+ encoding_version_
+ }();
+ return WriteSegmentTask{library_}(std::move(encoded));
+ }
-entity::VariantKey write_sync(PartialKey pk, SegmentInMemory &&segment) override {
- return write_sync(pk.key_type, pk.version_id, pk.stream_id, pk.start_index, pk.end_index, std::move(segment));
-}
+ entity::VariantKey write_sync(PartialKey pk, SegmentInMemory&& segment) override {
+ return write_sync(pk.key_type, pk.version_id, pk.stream_id, pk.start_index, pk.end_index, std::move(segment));
+ }
-entity::VariantKey write_sync(
- KeyType key_type,
- const StreamId &stream_id,
- SegmentInMemory &&segment) override {
- util::check(is_ref_key_class(key_type), "Expected ref key type got {}", key_type);
- auto encoded = EncodeRefTask{key_type, stream_id, std::move(segment), codec_, encoding_version_}();
- return WriteSegmentTask{library_}(std::move(encoded));
-}
+ entity::VariantKey write_sync(KeyType key_type, const StreamId& stream_id, SegmentInMemory&& segment) override {
+ util::check(is_ref_key_class(key_type), "Expected ref key type got {}", key_type);
+ auto encoded = EncodeRefTask{key_type, stream_id, std::move(segment), codec_, encoding_version_}();
+ return WriteSegmentTask{library_}(std::move(encoded));
+ }
-entity::VariantKey write_if_none_sync(
- KeyType key_type,
- const StreamId &stream_id,
- SegmentInMemory &&segment) override {
- util::check(is_ref_key_class(key_type), "Expected ref key type got {}", key_type);
- auto encoded = EncodeRefTask{key_type, stream_id, std::move(segment), codec_, encoding_version_}();
- return WriteIfNoneTask{library_}(std::move(encoded));
-}
+ entity::VariantKey write_if_none_sync(KeyType key_type, const StreamId& stream_id, SegmentInMemory&& segment)
+ override {
+ util::check(is_ref_key_class(key_type), "Expected ref key type got {}", key_type);
+ auto encoded = EncodeRefTask{key_type, stream_id, std::move(segment), codec_, encoding_version_}();
+ return WriteIfNoneTask{library_}(std::move(encoded));
+ }
-bool is_path_valid(const std::string_view path) const override {
- return library_->is_path_valid(path);
-}
+ bool is_path_valid(const std::string_view path) const override { return library_->is_path_valid(path); }
-folly::Future write_compressed(storage::KeySegmentPair ks) override {
- return async::submit_io_task(WriteCompressedTask{std::move(ks), library_});
-}
+ folly::Future write_compressed(storage::KeySegmentPair ks) override {
+ return async::submit_io_task(WriteCompressedTask{std::move(ks), library_});
+ }
-void write_compressed_sync(storage::KeySegmentPair ks) override {
- library_->write(ks);
-}
+ void write_compressed_sync(storage::KeySegmentPair ks) override { library_->write(ks); }
-folly::Future update(const entity::VariantKey &key,
- SegmentInMemory &&segment,
- storage::UpdateOpts opts) override {
- auto stream_id = variant_key_id(key);
- util::check(segment.descriptor().id() == stream_id,
+ folly::Future update(
+ const entity::VariantKey& key, SegmentInMemory&& segment, storage::UpdateOpts opts
+ ) override {
+ auto stream_id = variant_key_id(key);
+ util::check(
+ segment.descriptor().id() == stream_id,
"Descriptor id mismatch in variant key {} != {}",
stream_id,
- segment.descriptor().id());
+ segment.descriptor().id()
+ );
- return async::submit_cpu_task(EncodeSegmentTask{
- key, std::move(segment), codec_, encoding_version_
- })
- .via(&async::io_executor())
- .thenValue(UpdateSegmentTask{library_, opts});
-}
+ return async::submit_cpu_task(EncodeSegmentTask{key, std::move(segment), codec_, encoding_version_})
+ .via(&async::io_executor())
+ .thenValue(UpdateSegmentTask{library_, opts});
+ }
-folly::Future copy(
- KeyType key_type,
- const StreamId &stream_id,
- VersionId version_id,
- const VariantKey &source_key) override {
- return async::submit_io_task(CopyCompressedTask{source_key, key_type, stream_id, version_id, library_});
-}
+ folly::Future copy(
+ KeyType key_type, const StreamId& stream_id, VersionId version_id, const VariantKey& source_key
+ ) override {
+ return async::submit_io_task(
+ CopyCompressedTask{source_key, key_type, stream_id, version_id, library_}
+ );
+ }
-VariantKey copy_sync(
- KeyType key_type,
- const StreamId &stream_id,
- VersionId version_id,
- const VariantKey &source_key) override {
- return CopyCompressedTask{source_key, key_type, stream_id, version_id, library_}();
-}
+ VariantKey copy_sync(
+ KeyType key_type, const StreamId& stream_id, VersionId version_id, const VariantKey& source_key
+ ) override {
+ return CopyCompressedTask{source_key, key_type, stream_id, version_id, library_}();
+ }
-timestamp current_timestamp() override {
- return ClockType::nanos_since_epoch();
-}
+ timestamp current_timestamp() override { return ClockType::nanos_since_epoch(); }
-void iterate_type(
- KeyType type,
- const entity::IterateTypeVisitor& func,
- const std::string &prefix) override {
- library_->iterate_type(type, func, prefix);
-}
+ void iterate_type(KeyType type, const entity::IterateTypeVisitor& func, const std::string& prefix) override {
+ library_->iterate_type(type, func, prefix);
+ }
-folly::Future<folly::Unit> visit_object_sizes(
- KeyType type, const std::optional& stream_id_opt, storage::ObjectSizesVisitor visitor) override {
- std::string prefix;
- if (stream_id_opt) {
- const auto& stream_id = *stream_id_opt;
- prefix = std::holds_alternative(stream_id) ? std::get(stream_id) : std::string();
- }
-
- if (library_->supports_object_size_calculation()) {
- // The library has native support for some kind of clever size calculation, so let it take over
- return async::submit_io_task(VisitObjectSizesTask{type, prefix, std::move(visitor), library_});
- }
-
- // No native support for a clever size calculation, so just read keys and sum their sizes
- KeySizeCalculators key_size_calculators;
- iterate_type(type, [&key_size_calculators, &stream_id_opt, &visitor](VariantKey&& k) {
- key_size_calculators.emplace_back(std::move(k), [visitor, stream_id_opt] (auto&& key_seg) {
- if (!stream_id_opt || variant_key_id(key_seg.variant_key()) == *stream_id_opt) {
- auto compressed_size = key_seg.segment().size();
- visitor(key_seg.variant_key(), compressed_size);
- }
- return std::forward(key_seg).variant_key();
- });
- }, prefix);
-
- read_ignoring_key_not_found(std::move(key_size_calculators));
- return folly::makeFuture();
-}
+ folly::Future<folly::Unit> visit_object_sizes(
+ KeyType type, const std::optional& stream_id_opt, storage::ObjectSizesVisitor visitor
+ ) override {
+ std::string prefix;
+ if (stream_id_opt) {
+ const auto& stream_id = *stream_id_opt;
+ prefix = std::holds_alternative(stream_id) ? std::get(stream_id) : std::string();
+ }
-folly::Future> get_object_sizes(KeyType type, const std::optional& stream_id_opt) override {
- auto counter = std::make_shared(0);
- auto bytes = std::make_shared(0);
- storage::ObjectSizesVisitor visitor = [counter, bytes](const VariantKey&, storage::CompressedSize size) {
- counter->fetch_add(1, std::memory_order_relaxed);
- bytes->fetch_add(size, std::memory_order_relaxed);
- };
-
- return visit_object_sizes(type, stream_id_opt, std::move(visitor))
- .thenValueInline([counter, bytes, type](folly::Unit&&) {
- return std::make_shared(type, *counter, *bytes);
- });
-}
+ if (library_->supports_object_size_calculation()) {
+ // The library has native support for some kind of clever size calculation, so let it take over
+ return async::submit_io_task(VisitObjectSizesTask{type, prefix, std::move(visitor), library_});
+ }
-bool scan_for_matching_key(
- KeyType key_type, const IterateTypePredicate& predicate) override {
- return library_->scan_for_matching_key(key_type, predicate);
-}
+ // No native support for a clever size calculation, so just read keys and sum their sizes
+ KeySizeCalculators key_size_calculators;
+ iterate_type(
+ type,
+ [&key_size_calculators, &stream_id_opt, &visitor](VariantKey&& k) {
+ key_size_calculators.emplace_back(std::move(k), [visitor, stream_id_opt](auto&& key_seg) {
+ if (!stream_id_opt || variant_key_id(key_seg.variant_key()) == *stream_id_opt) {
+ auto compressed_size = key_seg.segment().size();
+ visitor(key_seg.variant_key(), compressed_size);
+ }
+ return std::forward(key_seg).variant_key();
+ });
+ },
+ prefix
+ );
+
+ read_ignoring_key_not_found(std::move(key_size_calculators));
+ return folly::makeFuture();
+ }
-folly::Future> read(
- const entity::VariantKey &key,
- storage::ReadKeyOpts opts) override {
- return read_and_continue(key, library_, opts, DecodeSegmentTask{});
-}
+ folly::Future> get_object_sizes(
+ KeyType type, const std::optional& stream_id_opt
+ ) override {
+ auto counter = std::make_shared(0);
+ auto bytes = std::make_shared(0);
+ storage::ObjectSizesVisitor visitor = [counter, bytes](const VariantKey&, storage::CompressedSize size) {
+ counter->fetch_add(1, std::memory_order_relaxed);
+ bytes->fetch_add(size, std::memory_order_relaxed);
+ };
+
+ return visit_object_sizes(type, stream_id_opt, std::move(visitor))
+ .thenValueInline([counter, bytes, type](folly::Unit&&) {
+ return std::make_shared(type, *counter, *bytes);
+ });
+ }
-std::pair read_sync(const entity::VariantKey& key, storage::ReadKeyOpts opts) override {
- return DecodeSegmentTask{}(read_sync_dispatch(key, library_, opts));
-}
+ bool scan_for_matching_key(KeyType key_type, const IterateTypePredicate& predicate) override {
+ return library_->scan_for_matching_key(key_type, predicate);
+ }
-folly::Future<storage::KeySegmentPair> read_compressed(
- const entity::VariantKey &key,
- storage::ReadKeyOpts opts) override {
- return read_and_continue(key, library_, opts, PassThroughTask{});
-}
+ folly::Future> read(
+ const entity::VariantKey& key, storage::ReadKeyOpts opts
+ ) override {
+ return read_and_continue(key, library_, opts, DecodeSegmentTask{});
+ }
+
+ std::pair read_sync(const entity::VariantKey& key, storage::ReadKeyOpts opts)
+ override {
+ return DecodeSegmentTask{}(read_sync_dispatch(key, library_, opts));
+ }
-storage::KeySegmentPair read_compressed_sync(const entity::VariantKey& key, storage::ReadKeyOpts opts) override {
+ folly::Future<storage::KeySegmentPair> read_compressed(const entity::VariantKey& key, storage::ReadKeyOpts opts)
+ override {
+ return read_and_continue(key, library_, opts, PassThroughTask{});
+ }
+
+ storage::KeySegmentPair read_compressed_sync(const entity::VariantKey& key, storage::ReadKeyOpts opts) override {
return read_sync_dispatch(key, library_, opts);
-}
+ }
-folly::Future, std::optional>> read_metadata(const entity::VariantKey &key, storage::ReadKeyOpts opts) override {
- return read_and_continue(key, library_, opts, DecodeMetadataTask{});
-}
+ folly::Future, std::optional>> read_metadata(
+ const entity::VariantKey& key, storage::ReadKeyOpts opts
+ ) override {
+ return read_and_continue(key, library_, opts, DecodeMetadataTask{});
+ }
-folly::Future, StreamDescriptor>> read_metadata_and_descriptor(
- const entity::VariantKey &key,
- storage::ReadKeyOpts opts) override {
- return read_and_continue(key, library_, opts, DecodeMetadataAndDescriptorTask{});
-}
+ folly::Future, StreamDescriptor>>
+ read_metadata_and_descriptor(const entity::VariantKey& key, storage::ReadKeyOpts opts) override {
+ return read_and_continue(key, library_, opts, DecodeMetadataAndDescriptorTask{});
+ }
-folly::Future> read_timeseries_descriptor(
- const entity::VariantKey &key,
- storage::ReadKeyOpts opts) override {
- return read_and_continue(key, library_, opts, DecodeTimeseriesDescriptorTask{});
-}
+ folly::Future> read_timeseries_descriptor(
+ const entity::VariantKey& key, storage::ReadKeyOpts opts
+ ) override {
+ return read_and_continue(key, library_, opts, DecodeTimeseriesDescriptorTask{});
+ }
-folly::Future