Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit a5274d2

Browse files
committed
store: Enable using large Bytes field
1 parent cb79030 commit a5274d2

File tree

3 files changed

+220
-54
lines changed

3 files changed

+220
-54
lines changed

store/postgres/src/relational.rs

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ const DELETE_OPERATION_CHUNK_SIZE: usize = 1_000;
5858
/// This also makes sure that we do not put strings into a BTree index that's
5959
/// bigger than Postgres' limit on such strings which is about 2k
6060
pub const STRING_PREFIX_SIZE: usize = 256;
61+
pub const BYTE_ARRAY_PREFIX_SIZE: usize = 64;
6162

6263
lazy_static! {
6364
/// Deprecated; use 'graphman stats account-like' instead. A list of
@@ -1163,7 +1164,9 @@ impl Column {
11631164
/// lengths. Such columns may contain very large values and need to be
11641165
/// handled specially for indexing
11651166
pub fn has_arbitrary_size(&self) -> bool {
1166-
!self.is_primary_key() && !self.is_list() && self.column_type == ColumnType::String
1167+
!self.is_primary_key()
1168+
&& !self.is_list()
1169+
&& (self.column_type == ColumnType::String || self.column_type == ColumnType::Bytes)
11671170
}
11681171

11691172
pub fn is_assignable_from(&self, source: &Self, object: &EntityType) -> Option<String> {
@@ -1486,12 +1489,23 @@ impl Table {
14861489
("gist", index_expr)
14871490
}
14881491
} else {
1489-
// Attributes that are plain strings are indexed with a BTree; but
1490-
// they can be too large for Postgres' limit on values that can go
1491-
// into a BTree. For those attributes, only index the first
1492-
// STRING_PREFIX_SIZE characters
1492+
// Attributes that are plain strings or bytes are
1493+
// indexed with a BTree; but they can be too large for
1494+
// Postgres' limit on values that can go into a BTree.
1495+
// For those attributes, only index the first
1496+
// STRING_PREFIX_SIZE or BYTE_ARRAY_PREFIX_SIZE characters
14931497
let index_expr = if column.has_arbitrary_size() {
1494-
format!("left({}, {})", column.name.quoted(), STRING_PREFIX_SIZE)
1498+
match column.column_type {
1499+
ColumnType::String => {
1500+
format!("left({}, {})", column.name.quoted(), STRING_PREFIX_SIZE)
1501+
}
1502+
ColumnType::Bytes => format!(
1503+
"substring({}, 1, {})",
1504+
column.name.quoted(),
1505+
BYTE_ARRAY_PREFIX_SIZE
1506+
),
1507+
_ => unreachable!("only String and Bytes can have arbitrary size"),
1508+
}
14951509
} else {
14961510
column.name.quoted()
14971511
};
@@ -1895,7 +1909,7 @@ create index attr_1_3_scalar_big_decimal
18951909
create index attr_1_4_scalar_string
18961910
on sgd0815.\"scalar\" using btree(left(\"string\", 256));
18971911
create index attr_1_5_scalar_bytes
1898-
on sgd0815.\"scalar\" using btree(\"bytes\");
1912+
on sgd0815.\"scalar\" using btree(substring(\"bytes\", 1, 64));
18991913
create index attr_1_6_scalar_big_int
19001914
on sgd0815.\"scalar\" using btree(\"big_int\");
19011915
create index attr_1_7_scalar_color

store/postgres/src/relational_queries.rs

Lines changed: 106 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ use std::iter::FromIterator;
3232
use std::str::FromStr;
3333

3434
use crate::relational::{
35-
Column, ColumnType, IdType, Layout, SqlName, Table, PRIMARY_KEY_COLUMN, STRING_PREFIX_SIZE,
35+
Column, ColumnType, IdType, Layout, SqlName, Table, BYTE_ARRAY_PREFIX_SIZE, PRIMARY_KEY_COLUMN,
36+
STRING_PREFIX_SIZE,
3637
};
3738
use crate::sql_value::SqlValue;
3839
use crate::{
@@ -665,40 +666,110 @@ impl Comparison {
665666
}
666667
}
667668

669+
enum PrefixType<'a> {
670+
String(&'a Column),
671+
Bytes(&'a Column),
672+
}
673+
674+
impl<'a> PrefixType<'a> {
675+
fn new(column: &'a Column) -> QueryResult<Self> {
676+
match column.column_type {
677+
ColumnType::String => Ok(PrefixType::String(column)),
678+
ColumnType::Bytes => Ok(PrefixType::Bytes(column)),
679+
_ => Err(constraint_violation!(
680+
"cannot setup prefix comparison for column {} of type {}",
681+
column.name(),
682+
column.column_type().sql_type()
683+
)),
684+
}
685+
}
686+
687+
/// Push the SQL expression for a prefix of values in our column. That
688+
/// should be the same expression that we used when creating an index
689+
/// for the column
690+
fn push_column_prefix(&self, out: &mut AstPass<Pg>) -> QueryResult<()> {
691+
match self {
692+
PrefixType::String(column) => {
693+
out.push_sql("left(");
694+
out.push_identifier(column.name.as_str())?;
695+
out.push_sql(", ");
696+
out.push_sql(&STRING_PREFIX_SIZE.to_string());
697+
out.push_sql(")");
698+
}
699+
PrefixType::Bytes(column) => {
700+
out.push_sql("substring(");
701+
out.push_identifier(column.name.as_str())?;
702+
out.push_sql(", 1, ");
703+
out.push_sql(&BYTE_ARRAY_PREFIX_SIZE.to_string());
704+
out.push_sql(")");
705+
}
706+
}
707+
Ok(())
708+
}
709+
710+
fn is_large(&self, value: &Value) -> Result<bool, ()> {
711+
match (self, value) {
712+
(PrefixType::String(_), Value::String(s)) => Ok(s.len() > STRING_PREFIX_SIZE - 1),
713+
(PrefixType::Bytes(_), Value::Bytes(b)) => Ok(b.len() > BYTE_ARRAY_PREFIX_SIZE - 1),
714+
(PrefixType::Bytes(_), Value::String(s)) => {
715+
let len = if s.starts_with("0x") {
716+
(s.len() - 2) / 2
717+
} else {
718+
s.len() / 2
719+
};
720+
Ok(len > BYTE_ARRAY_PREFIX_SIZE - 1)
721+
}
722+
_ => Err(()),
723+
}
724+
}
725+
}
726+
668727
/// Produce a comparison between the string column `column` and the string
669728
/// value `text` that makes it obvious to Postgres' optimizer that it can
670729
/// first consult the partial index on `left(column, STRING_PREFIX_SIZE)`
671730
/// instead of going straight to a sequential scan of the underlying table.
672731
/// We do this by writing the comparison `column op text` in a way that
673732
/// involves `left(column, STRING_PREFIX_SIZE)`
674-
#[derive(Constructor)]
675733
struct PrefixComparison<'a> {
676734
op: Comparison,
735+
kind: PrefixType<'a>,
677736
column: &'a Column,
678737
text: &'a Value,
679738
}
680739

681740
impl<'a> PrefixComparison<'a> {
682-
fn push_column_prefix(column: &Column, mut out: AstPass<Pg>) -> QueryResult<()> {
683-
out.push_sql("left(");
684-
out.push_identifier(column.name.as_str())?;
685-
out.push_sql(", ");
686-
out.push_sql(&STRING_PREFIX_SIZE.to_string());
687-
out.push_sql(")");
688-
Ok(())
741+
fn new(op: Comparison, column: &'a Column, text: &'a Value) -> QueryResult<Self> {
742+
let kind = PrefixType::new(column)?;
743+
Ok(Self {
744+
op,
745+
kind,
746+
column,
747+
text,
748+
})
689749
}
690750

691751
fn push_value_prefix(&self, mut out: AstPass<Pg>) -> QueryResult<()> {
692-
out.push_sql("left(");
693-
QueryValue(self.text, &self.column.column_type).walk_ast(out.reborrow())?;
694-
out.push_sql(", ");
695-
out.push_sql(&STRING_PREFIX_SIZE.to_string());
696-
out.push_sql(")");
752+
match self.kind {
753+
PrefixType::String(column) => {
754+
out.push_sql("left(");
755+
QueryValue(self.text, &column.column_type).walk_ast(out.reborrow())?;
756+
out.push_sql(", ");
757+
out.push_sql(&STRING_PREFIX_SIZE.to_string());
758+
out.push_sql(")");
759+
}
760+
PrefixType::Bytes(column) => {
761+
out.push_sql("substring(");
762+
QueryValue(self.text, &column.column_type).walk_ast(out.reborrow())?;
763+
out.push_sql(", 1, ");
764+
out.push_sql(&BYTE_ARRAY_PREFIX_SIZE.to_string());
765+
out.push_sql(")");
766+
}
767+
}
697768
Ok(())
698769
}
699770

700771
fn push_prefix_cmp(&self, op: Comparison, mut out: AstPass<Pg>) -> QueryResult<()> {
701-
Self::push_column_prefix(self.column, out.reborrow())?;
772+
self.kind.push_column_prefix(&mut out)?;
702773
out.push_sql(op.as_str());
703774
self.push_value_prefix(out.reborrow())
704775
}
@@ -749,18 +820,16 @@ impl<'a> QueryFragment<Pg> for PrefixComparison<'a> {
749820
//
750821
// For `op` either `<=` or `>=`, we can write (using '<=' as an example)
751822
// uv <= st <=> u < s || u = s && uv <= st
752-
let large = if let Value::String(s) = self.text {
753-
// We need to check the entire string
754-
s.len() > STRING_PREFIX_SIZE - 1
755-
} else {
756-
return Err(constraint_violation!(
823+
let large = self.kind.is_large(&self.text).map_err(|()| {
824+
constraint_violation!(
757825
"column {} has type {} and can't be compared with the value `{}` using {}",
758826
self.column.name(),
759827
self.column.column_type().sql_type(),
760828
self.text,
761829
self.op.as_str()
762-
));
763-
};
830+
)
831+
})?;
832+
764833
match self.op {
765834
Equal => {
766835
if large {
@@ -961,35 +1030,25 @@ impl<'a> QueryFilter<'a> {
9611030
) -> QueryResult<()> {
9621031
let column = self.column(attribute);
9631032

964-
if column.has_arbitrary_size() {
965-
PrefixComparison::new(op, column, value).walk_ast(out.reborrow())?;
1033+
if matches!(value, Value::Null) {
1034+
// Deal with nulls first since they always need special
1035+
// treatment
1036+
out.push_identifier(column.name.as_str())?;
1037+
match op {
1038+
Comparison::Equal => out.push_sql(" is null"),
1039+
Comparison::NotEqual => out.push_sql(" is not null"),
1040+
_ => unreachable!("we only call equals with '=' or '!='"),
1041+
}
1042+
} else if column.has_arbitrary_size() {
1043+
PrefixComparison::new(op, column, value)?.walk_ast(out.reborrow())?;
9661044
} else if column.is_fulltext() {
9671045
out.push_identifier(column.name.as_str())?;
9681046
out.push_sql(Comparison::Match.as_str());
9691047
QueryValue(value, &column.column_type).walk_ast(out)?;
9701048
} else {
9711049
out.push_identifier(column.name.as_str())?;
972-
973-
match value {
974-
Value::String(_)
975-
| Value::BigInt(_)
976-
| Value::Bool(_)
977-
| Value::Bytes(_)
978-
| Value::BigDecimal(_)
979-
| Value::Int(_)
980-
| Value::List(_) => {
981-
out.push_sql(op.as_str());
982-
QueryValue(value, &column.column_type).walk_ast(out)?;
983-
}
984-
Value::Null => {
985-
use Comparison as c;
986-
match op {
987-
c::Equal => out.push_sql(" is null"),
988-
c::NotEqual => out.push_sql(" is not null"),
989-
_ => unreachable!("we only call equals with '=' or '!='"),
990-
}
991-
}
992-
}
1050+
out.push_sql(op.as_str());
1051+
QueryValue(value, &column.column_type).walk_ast(out)?;
9931052
}
9941053
Ok(())
9951054
}
@@ -1004,7 +1063,7 @@ impl<'a> QueryFilter<'a> {
10041063
let column = self.column(attribute);
10051064

10061065
if column.has_arbitrary_size() {
1007-
PrefixComparison::new(op, column, value).walk_ast(out.reborrow())?;
1066+
PrefixComparison::new(op, column, value)?.walk_ast(out.reborrow())?;
10081067
} else {
10091068
out.push_identifier(column.name.as_str())?;
10101069
out.push_sql(op.as_str());
@@ -1086,7 +1145,7 @@ impl<'a> QueryFilter<'a> {
10861145
// Postgres' query optimizer
10871146
// See PrefixComparison for a more detailed discussion of what
10881147
// is happening here
1089-
PrefixComparison::push_column_prefix(column, out.reborrow())?;
1148+
PrefixType::new(column)?.push_column_prefix(&mut out)?;
10901149
} else {
10911150
out.push_identifier(column.name.as_str())?;
10921151
}

store/postgres/tests/store.rs

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1594,6 +1594,99 @@ fn handle_large_string_with_index() {
15941594
})
15951595
}
15961596

1597+
#[test]
1598+
fn handle_large_bytea_with_index() {
1599+
const NAME: &str = "bin_name";
1600+
const ONE: &str = "large_string_one";
1601+
const TWO: &str = "large_string_two";
1602+
1603+
fn make_insert_op(id: &str, name: &[u8]) -> EntityModification {
1604+
let mut data = Entity::new();
1605+
data.set("id", id);
1606+
data.set(NAME, scalar::Bytes::from(name));
1607+
1608+
let key = EntityKey::data(TEST_SUBGRAPH_ID.clone(), USER.to_owned(), id.to_owned());
1609+
1610+
EntityModification::Insert { key, data }
1611+
}
1612+
1613+
run_test(|store, writable, deployment| async move {
1614+
// We have to produce a massive bytea (240_000 bytes) because the
1615+
// repeated text compresses so well. This leads to an error 'index
1616+
// row size 2784 exceeds btree version 4 maximum 2704' if used with
1617+
// a btree index without size limitation
1618+
let long_bytea = std::iter::repeat("Quo usque tandem")
1619+
.take(15000)
1620+
.collect::<String>()
1621+
.into_bytes();
1622+
let other_bytea = {
1623+
let mut other_bytea = long_bytea.clone();
1624+
other_bytea.push(b'X');
1625+
scalar::Bytes::from(other_bytea.as_slice())
1626+
};
1627+
let long_bytea = scalar::Bytes::from(long_bytea.as_slice());
1628+
1629+
let metrics_registry = Arc::new(MockMetricsRegistry::new());
1630+
let stopwatch_metrics = StopwatchMetrics::new(
1631+
Logger::root(slog::Discard, o!()),
1632+
deployment.hash.clone(),
1633+
metrics_registry.clone(),
1634+
);
1635+
1636+
writable
1637+
.transact_block_operations(
1638+
TEST_BLOCK_3_PTR.clone(),
1639+
None,
1640+
vec![
1641+
make_insert_op(ONE, &long_bytea),
1642+
make_insert_op(TWO, &other_bytea),
1643+
],
1644+
stopwatch_metrics,
1645+
Vec::new(),
1646+
Vec::new(),
1647+
)
1648+
.expect("Failed to insert large text");
1649+
1650+
let query = user_query()
1651+
.first(5)
1652+
.filter(EntityFilter::Equal(
1653+
NAME.to_owned(),
1654+
long_bytea.clone().into(),
1655+
))
1656+
.asc(NAME);
1657+
1658+
let ids = store
1659+
.subgraph_store()
1660+
.find(query)
1661+
.expect("Could not find entity")
1662+
.iter()
1663+
.map(|e| e.id())
1664+
.collect::<Result<Vec<_>, _>>()
1665+
.expect("Found entities without an id");
1666+
1667+
assert_eq!(vec![ONE], ids);
1668+
1669+
// Make sure we check the full string and not just a prefix
1670+
let prefix = scalar::Bytes::from(&long_bytea.as_slice()[..64]);
1671+
let query = user_query()
1672+
.first(5)
1673+
.filter(EntityFilter::LessOrEqual(NAME.to_owned(), prefix.into()))
1674+
.asc(NAME);
1675+
1676+
let ids = store
1677+
.subgraph_store()
1678+
.find(query)
1679+
.expect("Could not find entity")
1680+
.iter()
1681+
.map(|e| e.id())
1682+
.collect::<Result<Vec<_>, _>>()
1683+
.expect("Found entities without an id");
1684+
1685+
// Users with name 'Cindini' and 'Johnton'
1686+
assert_eq!(vec!["2", "1"], ids);
1687+
})
1688+
}
1689+
15971690
#[derive(Clone)]
15981691
struct WindowQuery(EntityQuery, Arc<DieselSubgraphStore>);
15991692

0 commit comments

Comments (0)