Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit e88fc32

Browse files
authored
Merge pull request biolab#6389 from janezd/merge-nonunique-right
Merge Data: Allow non-unique values if they're unused
2 parents b909f89 + 2ba3c86 commit e88fc32

File tree

3 files changed

+189
-21
lines changed

3 files changed

+189
-21
lines changed

Orange/widgets/data/owmergedata.py

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -294,21 +294,30 @@ class Outputs:
294294
class Warning(widget.OWWidget.Warning):
295295
renamed_vars = Msg("Some variables have been renamed "
296296
"to avoid duplicates.\n{}")
297+
nonunique_left = Msg(
298+
"Some (unused) combinations of values in Data appear in "
299+
"multiple rows.")
300+
nonunique_right = Msg(
301+
"Some (unused) combinations of values in Extra Data appear in "
302+
"multiple rows.")
297303

298304
class Error(widget.OWWidget.Error):
299305
matching_numeric_with_nonnum = Msg(
300306
"Numeric and non-numeric columns ({} and {}) cannot be matched.")
301307
matching_index_with_sth = Msg("Row index cannot be matched with {}.")
302308
matching_id_with_sth = Msg("Instance cannot be matched with {}.")
309+
nonunique_left_matched = Msg(
310+
"Some combinations of values in Data appear in multiple rows."
311+
"\nEvery matched combination may appear at most once.")
303312
nonunique_left = Msg(
304-
"Some combinations of values on the left appear in multiple rows.\n"
305-
"For this type of merging, every possible combination of values "
306-
"on the left should appear at most once.")
313+
"Some combinations of values in Data appear in multiple rows."
314+
"\nEvery combination may appear at most once.")
315+
nonunique_right_matched = Msg(
316+
"Some combinations of values in Extra Data appear in multiple rows."
317+
"\nEvery matched combination may appear at most once.")
307318
nonunique_right = Msg(
308-
"Some combinations of values on the right appear in multiple rows."
309-
"\n"
310-
"Every possible combination of values on the right should appear "
311-
"at most once.")
319+
"Some combinations of values in Extra Data appear in multiple rows."
320+
"\nEvery combination may appear at most once.")
312321

313322
def __init__(self):
314323
super().__init__()
@@ -441,16 +450,47 @@ def _get_col_name(obj):
441450
return f"'{obj.name}'" if isinstance(obj, Variable) else obj.lower()
442451

443452
def _check_uniqueness(self, left, left_mask, right, right_mask):
444-
ok = True
453+
# Right table is always checked
445454
masked_right = right[right_mask]
446-
if len(set(map(tuple, masked_right))) != len(masked_right):
447-
self.Error.nonunique_right()
448-
ok = False
449-
if self.merging != self.LeftJoin:
455+
right_set = set(map(tuple, masked_right))
456+
right_duplicates = len(right_set) != len(masked_right)
457+
458+
# Left table is checked on non-left join; on left join it is needed
459+
# only to check whether right duplicates are critical
460+
left_duplicates = None
461+
if self.merging != self.LeftJoin or right_duplicates:
450462
masked_left = left[left_mask]
451-
if len(set(map(tuple, masked_left))) != len(masked_left):
452-
self.Error.nonunique_left()
463+
left_set = set(map(tuple, masked_left))
464+
left_duplicates = len(left_set) != len(masked_left)
465+
466+
# Handle outer join and exit
467+
if self.merging == self.OuterJoin:
468+
self.Error.nonunique_left(shown=left_duplicates)
469+
self.Error.nonunique_right(shown=right_duplicates)
470+
return not (left_duplicates or right_duplicates)
471+
472+
# Intersection is needed to check whether duplicates are critical;
473+
if left_duplicates or right_duplicates:
474+
n_inter = len(left_set & right_set)
475+
476+
ok = True
477+
478+
if right_duplicates:
479+
# `sum` counts the number of times that masked_right items are used.
480+
# If this equals the intersection, each is used just once.
481+
if sum(tuple(mr) in left_set for mr in masked_right) == n_inter:
482+
self.Warning.nonunique_right()
483+
else:
484+
self.Error.nonunique_right_matched()
453485
ok = False
486+
487+
if self.merging == self.InnerJoin and left_duplicates:
488+
if sum(tuple(ml) in right_set for ml in masked_left) == n_inter:
489+
self.Warning.nonunique_left()
490+
else:
491+
self.Error.nonunique_left_matched()
492+
ok = False
493+
454494
return ok
455495

456496
def _compute_reduced_extra_data(self,

Orange/widgets/data/tests/test_owmergedata.py

Lines changed: 125 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -838,62 +838,182 @@ def test_nonunique(self):
838838

839839
self.assertFalse(widget.Error.nonunique_left.is_shown())
840840
self.assertFalse(widget.Error.nonunique_right.is_shown())
841+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
841842

842843
widget.attr_boxes.set_state([(INSTANCEID, INSTANCEID)])
843844
widget.commit.now()
844845
self.assertFalse(widget.Error.nonunique_left.is_shown())
845846
self.assertFalse(widget.Error.nonunique_right.is_shown())
847+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
846848
self.assertIsNotNone(self.get_output(widget.Outputs.data))
847849

848850
widget.attr_boxes.set_state([(INDEX, INDEX)])
849851
widget.commit.now()
850852
self.assertFalse(widget.Error.nonunique_left.is_shown())
851853
self.assertFalse(widget.Error.nonunique_right.is_shown())
854+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
852855
self.assertIsNotNone(self.get_output(widget.Outputs.data))
853856

854857
widget.attr_boxes.set_state([(x, x)])
855858
widget.commit.now()
856-
self.assertTrue(widget.Error.nonunique_left.is_shown())
859+
self.assertTrue(widget.Error.nonunique_left_matched.is_shown())
857860
self.assertFalse(widget.Error.nonunique_right.is_shown())
861+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
858862
self.assertIsNone(self.get_output(widget.Outputs.data))
859863

860864
widget.merging = widget.LeftJoin
861865
widget.commit.now()
862866
self.assertFalse(widget.Error.nonunique_left.is_shown())
863867
self.assertFalse(widget.Error.nonunique_right.is_shown())
868+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
864869
self.assertIsNotNone(self.get_output(widget.Outputs.data))
865870

866871
widget.merging = widget.InnerJoin
867872
widget.attr_boxes.set_state([(x, x), (d, d)])
868873
widget.commit.now()
869874
self.assertFalse(widget.Error.nonunique_left.is_shown())
870875
self.assertFalse(widget.Error.nonunique_right.is_shown())
876+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
871877
self.assertIsNotNone(self.get_output(widget.Outputs.data))
872878

873879
widget.attr_boxes.set_state([(d, d)])
874880
widget.commit.now()
875-
self.assertTrue(widget.Error.nonunique_left.is_shown())
876-
self.assertTrue(widget.Error.nonunique_right.is_shown())
881+
self.assertTrue(widget.Error.nonunique_left_matched.is_shown())
882+
self.assertTrue(widget.Error.nonunique_right_matched.is_shown())
883+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
877884
self.assertIsNone(self.get_output(widget.Outputs.data))
878885

879886
widget.merging = widget.LeftJoin
880887
widget.commit.now()
881888
self.assertFalse(widget.Error.nonunique_left.is_shown())
882-
self.assertTrue(widget.Error.nonunique_right.is_shown())
889+
self.assertTrue(widget.Error.nonunique_right_matched.is_shown())
890+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
883891
self.assertIsNone(self.get_output(widget.Outputs.data))
884892

885893
widget.merging = widget.InnerJoin
886894
widget.commit.now()
887-
self.assertTrue(widget.Error.nonunique_left.is_shown())
895+
self.assertTrue(widget.Error.nonunique_left_matched.is_shown())
896+
self.assertTrue(widget.Error.nonunique_right_matched.is_shown())
897+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
898+
self.assertIsNone(self.get_output(widget.Outputs.data))
899+
900+
self.send_signal(widget.Inputs.data, None)
901+
self.send_signal(widget.Inputs.extra_data, None)
902+
self.assertFalse(widget.Error.nonunique_left.is_shown())
903+
self.assertFalse(widget.Error.nonunique_right.is_shown())
904+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
905+
self.assertIsNone(self.get_output(widget.Outputs.data))
906+
907+
def test_nonunique_warning(self):
908+
widget = self.widget
909+
x = ContinuousVariable("x")
910+
d = DiscreteVariable("d", values=tuple("abc"))
911+
domain = Domain([x, d], [])
912+
dataA = Table.from_numpy(
913+
domain, np.array([[1.0, 0], [2, 1]]))
914+
dataB = Table.from_numpy(
915+
domain, np.array([[1.0, 0], [2, 1], [3, 1], [3, 1]]))
916+
dataB.ids = dataA.ids
917+
918+
919+
self.send_signal(widget.Inputs.data, dataA)
920+
self.send_signal(widget.Inputs.extra_data, dataB)
921+
widget.attr_boxes.set_state([(x, x)])
922+
923+
widget.merging = widget.LeftJoin
924+
widget.commit.now()
925+
self.assertFalse(widget.Error.nonunique_left.is_shown())
926+
self.assertFalse(widget.Error.nonunique_right.is_shown())
927+
self.assertFalse(widget.Error.nonunique_left_matched.is_shown())
928+
self.assertFalse(widget.Error.nonunique_right_matched.is_shown())
929+
self.assertFalse(widget.Warning.nonunique_left.is_shown())
930+
self.assertTrue(widget.Warning.nonunique_right.is_shown())
931+
self.assertIsNotNone(self.get_output(widget.Outputs.data))
932+
933+
widget.merging = widget.InnerJoin
934+
widget.commit.now()
935+
self.assertFalse(widget.Error.nonunique_left.is_shown())
936+
self.assertFalse(widget.Error.nonunique_right.is_shown())
937+
self.assertFalse(widget.Error.nonunique_left_matched.is_shown())
938+
self.assertFalse(widget.Error.nonunique_right_matched.is_shown())
939+
self.assertFalse(widget.Warning.nonunique_left.is_shown())
940+
self.assertTrue(widget.Warning.nonunique_right.is_shown())
941+
self.assertIsNotNone(self.get_output(widget.Outputs.data))
942+
943+
widget.merging = widget.OuterJoin
944+
widget.attr_boxes.set_state([(x, x), (d, d)])
945+
widget.commit.now()
946+
self.assertFalse(widget.Error.nonunique_left.is_shown())
888947
self.assertTrue(widget.Error.nonunique_right.is_shown())
948+
self.assertFalse(widget.Error.nonunique_left_matched.is_shown())
949+
self.assertFalse(widget.Error.nonunique_right_matched.is_shown())
950+
self.assertFalse(widget.Warning.nonunique_left.is_shown())
951+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
889952
self.assertIsNone(self.get_output(widget.Outputs.data))
890953

891954
self.send_signal(widget.Inputs.data, None)
892955
self.send_signal(widget.Inputs.extra_data, None)
893956
self.assertFalse(widget.Error.nonunique_left.is_shown())
894957
self.assertFalse(widget.Error.nonunique_right.is_shown())
958+
self.assertFalse(widget.Error.nonunique_left_matched.is_shown())
959+
self.assertFalse(widget.Error.nonunique_right_matched.is_shown())
960+
self.assertFalse(widget.Warning.nonunique_left.is_shown())
961+
self.assertFalse(widget.Warning.nonunique_right.is_shown())
895962
self.assertIsNone(self.get_output(widget.Outputs.data))
896963

964+
def test_check_uniqueness(self):
965+
# The test_nonunique and test_nonunique_warning tests above cover a larger
966+
# flow within the widget; this one tests the particular function,
967+
# _check_uniqueness, which performs the check
968+
969+
def test(left, right, indicators):
970+
aleft = np.vstack((left, np.zeros(len(left)))).T
971+
aright = np.vstack((right, np.zeros(len(right)))).T
972+
for w.merging, indi, msg in zip(
973+
(w.LeftJoin, w.InnerJoin, w.OuterJoin),
974+
indicators,
975+
("left", "inner", "outer")):
976+
if isinstance(indi, int):
977+
indi = (indi, )
978+
w.Error.clear()
979+
w.Warning.clear()
980+
w._check_uniqueness(np.array(aleft), mask[:len(left)],
981+
np.array(aright), mask[:len(right)])
982+
self.assertIs(w.Error.nonunique_left_matched.is_shown(), elm in indi, msg)
983+
self.assertIs(w.Error.nonunique_right_matched.is_shown(), erm in indi, msg)
984+
self.assertIs(w.Error.nonunique_left.is_shown(), el in indi, msg)
985+
self.assertIs(w.Error.nonunique_right.is_shown(), er in indi, msg)
986+
self.assertIs(w.Warning.nonunique_left.is_shown(), wl in indi, msg)
987+
self.assertIs(w.Warning.nonunique_right.is_shown(), wr in indi, msg)
988+
989+
mask = np.array([False, False, True, True, True, True])
990+
seq1234 = (0, 0, 1, 2, 3, 4)
991+
seq567 = (0, 0, 5, 6, 7)
992+
seq1233 = (0, 0, 1, 2, 3, 3)
993+
seq1255 = (0, 0, 1, 2, 5, 5)
994+
wl, wr, elm, erm, el, er = range(6)
995+
w = self.widget
996+
997+
# no duplicates
998+
test(seq1234, seq567, [()] * 3)
999+
test(seq1234, seq1234, [()] * 3)
1000+
1001+
# used duplicates on right: always error
1002+
test(seq1234, seq1233, [erm, erm, er])
1003+
1004+
# unused duplicates on right: error on outer, warning elsewhere
1005+
test(seq1234, seq1255, [wr, wr, er])
1006+
1007+
# (unused) duplicates on left: left is ok, inner warns, outer errors
1008+
test(seq1255, seq1234, [(), wl, el])
1009+
1010+
# duplicates on both sides: always error
1011+
test(seq1255, seq1255, [erm, (elm, erm), (el, er)])
1012+
1013+
# unused duplicates on both sides:
1014+
# left warns about right, inner warns both, outer errors both
1015+
test(seq1233, seq1255, [wr, (wl, wr), (el, er)])
1016+
8971017
def test_invalide_pairs(self):
8981018
widget = self.widget
8991019
x = ContinuousVariable("x")

doc/visual-programming/source/widgets/data/mergedata.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ Merges two datasets, based on values of selected attributes.
1414

1515
The **Merge Data** widget is used to horizontally merge two datasets, based on the values of selected attributes (columns). In the input, two datasets are required, data and extra data. Rows from the two data sets are matched by the values of pairs of attributes, chosen by the user. The widget produces one output. It corresponds to the instances from the input data to which attributes (columns) from input extra data are appended.
1616

17-
If the selected attribute pair does not contain unique values (in other words, the attributes have duplicate values), the widget will give a warning. Instead, one can match by more than one attribute. Click on the plus icon to add the attribute to merge on. The final result has to be a unique combination for each individual row.
17+
To match by a combination of features, click on the plus icon to add the features to merge on.
18+
19+
Depending on the merge type, the selected features may be required to have unique values (that is, no duplicates) in the data. When merging by multiple features, this requirement applies to combinations of their values.
1820

1921
![](images/Merge-Data-stamped.png)
2022

@@ -39,13 +41,17 @@ For example, the first table may contain city names and the second would be a li
3941

4042
In our example, the first Data input contained 6 cities, but the Extra Data did not provide Lat and Lon values for Bratislava, so the fields will be empty.
4143

44+
For this type of merge, the values on the left (e.g. cities) may repeat (e.g. the same city may appear multiple times), while the *used* values on the right must not. For example, let the right-hand table contain multiple Springfields. If Springfield does not appear on the left, the widget will show a warning but still merge the data. If Springfield appears on the left as well, the widget will show an error. This can be resolved if both tables also include data on the state (e.g. Illinois, Missouri, Oregon, Ohio) and this feature is added to the combination being matched.
45+
4246
![](images/MergeData_Append.png)
4347

4448
#####Find matching pairs of rows (inner join)
4549

4650
Only those rows that are matched will be present on the output, with the Extra Data columns appended. Rows without matches are removed.
4751

48-
In our example, Bratislava from the Data input did not have Lat and Lon values, while Belgrade from the Extra Data could not be found in the City column we were merging on. Hence both instances are remove - only the intersection of instances is sent to the output.
52+
In our example, Bratislava from the Data input did not have Lat and Lon values, while Belgrade from the Extra Data could not be found in the City column we were merging on. Hence both instances are removed - only the intersection of instances is sent to the output.
53+
54+
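For this type of merge, every *matched* combination of feature values must be unique on both the left and the right. Duplicated combinations that are not matched (that is, they appear on one side only) trigger a warning, but the data is still merged.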
4955

5056
![](images/MergeData_Intersection.png)
5157

@@ -55,6 +61,8 @@ The rows from both the Data and the Extra Data will be present on the output. Wh
5561

5662
In our example, both Bratislava and Belgrade are now present. Bratislava will have missing Lat and Lon values, while Belgrade will have a missing Population value.
5763

64+
For this type of merge, combinations of feature values on the left and on the right must be unique.
65+
5866
![](images/MergeData_Concatenate.png)
5967

6068
#####Row index

0 commit comments

Comments
 (0)