diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..4c91fbf2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# Ignore (I)Pyhton and GNU R trash +**/.ipynb_checkpoints/ +**/__pycache__/ +**/.idea/ +**/.Rhistory + +# Ignore trash +**/.DS_Store +**/._* +**/.~lock.* diff --git a/01_ud_preprocessing.py b/01_ud_preprocessing.py new file mode 100644 index 00000000..3486f330 --- /dev/null +++ b/01_ud_preprocessing.py @@ -0,0 +1,556 @@ +# +# Convert gold standard UD corpus to training/experiments format: +# 1) EstNLTK's format: add EstNLTK's automatic morphological +# annotations to CONLL files; +# 2) UD format: clean annotations by removing null nodes, +# deps and misc values (optional), and copy gold standard +# CONLL files to new location; +# + +import re +import os +import os.path +import sys + +from datetime import datetime +from random import Random +import configparser + +import conllu + +from estnltk import Text +from estnltk.taggers import Tagger +from estnltk.taggers import VabamorfTagger +from estnltk.taggers import MorphExtendedTagger +from estnltk.taggers import WhiteSpaceTokensTagger +from estnltk.taggers import PretokenizedTextCompoundTokensTagger + +from estnltk.converters.conll.conll_importer import conll_to_text + +# =============================================================== +# Convert UD corpora to training/experiments format: +# 1) EstNLTK's format: add EstNLTK's morphological annotations; +# 2) UD format: clean annotations by removing null nodes, +# deps and misc values; +# (MAIN) +# =============================================================== + +def convert_to_estnltk_conllu_main( conf_file, verbose=True ): + ''' + Converts gold standard CONLL-U files to training/experiments format. + Settings/parameters of the conversion will be read from the given + `conf_file`. 
+ Executes sections in the configuration starting with prefix 'preannotation_' + (add EstNLTK's morphological annotations) and 'copy_' (clean and copy files + with UD annotations). + See functions `convert_ud_conllu_to_estnltk_conllu(...)` and + `copy_and_clean_ud_conllu(...)` for details about the conversion + and possible parameters. + ''' + # Parse configuration file + config = configparser.ConfigParser() + if conf_file is None or not os.path.exists(conf_file): + raise FileNotFoundError("Config file {} does not exist".format(conf_file)) + if len(config.read(conf_file)) != 1: + raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file)) + morph_pipeline = [ WhiteSpaceTokensTagger(), + PretokenizedTextCompoundTokensTagger(), + VabamorfTagger(use_reorderer=True), + MorphExtendedTagger() ] + start = datetime.now() + section_found = False + for section in config.sections(): + if section.startswith('preannotation_'): + # Load preannotation configuration from the section + if not config.has_option(section, 'input_dir'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "input_dir" parameter.') + input_dir = config[section]['input_dir'] + if not os.path.isdir(input_dir): + raise FileNotFoundError(f'Error in {conf_file}: invalid "input_dir" value {input_dir!r} in {section!r}.') + if not config.has_option(section, 'morph_layer'): + raise ValueError(f'Error in {conf_file}: section {section} is missing "morph_layer" parameter.') + morph_layer = config[section]['morph_layer'] + if morph_layer == 'ud_morph_analysis': + # Add UDMorphConverter() to disambiguated morph + from estnltk.taggers import UDMorphConverter # requires estnltk 1.7.2+ + morph_pipeline.append( UDMorphConverter() ) + if not config.has_option(section, 'output_dir'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "output_dir" parameter.') + output_dir = config[section]['output_dir'] + seed = config[section].getint('seed', 43) 
+ dictionarize = config[section].getboolean('dictionarize', True) + remove_empty_nodes = config[section].getboolean('remove_empty_nodes', True) + remove_deps = config[section].getboolean('remove_deps', True) + remove_misc = config[section].getboolean('remove_misc', True) + replace_lemma_by_root = config[section].getboolean('replace_lemma_by_root', False) + remove_metadata = config[section].getboolean('remove_metadata', False) + # Collect input files. Make possible output files and dir + input_files = [] + output_files = [] + for fname in os.listdir(input_dir): + if fname.endswith('.conllu'): + input_files.append(os.path.join(input_dir, fname)) + out_fname = fname.replace('.conllu', f'-{morph_layer}.conllu') + output_files.append(os.path.join(output_dir, out_fname)) + if not input_files: + raise Exception(f'(!) No conllu files found from "input_dir" {input_dir!r}.') + if not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + # Convert files + for in_file, out_file in zip(input_files, output_files): + if verbose: + print(f'Reannotating {in_file} with layer {morph_layer} ...') + convert_ud_conllu_to_estnltk_conllu( in_file, morph_pipeline, morph_layer, out_file, + dictionarize=dictionarize, + replace_lemma_by_root=replace_lemma_by_root, + remove_empty_nodes=remove_empty_nodes, + remove_metadata=remove_metadata, + remove_deps=remove_deps, + remove_misc=remove_misc, + seed=seed ) + section_found = True + if section.startswith('copy_'): + # Load copying configuration from the section + if not config.has_option(section, 'input_dir'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "input_dir" parameter.') + input_dir = config[section]['input_dir'] + if not os.path.isdir(input_dir): + raise FileNotFoundError(f'Error in {conf_file}: invalid "input_dir" value {input_dir!r} in {section!r}.') + if not config.has_option(section, 'output_dir'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "output_dir" 
def convert_ud_conllu_to_estnltk_conllu( in_file, morph_pipeline, morph_layer, out_file,
                                         dictionarize=True, replace_lemma_by_root=False,
                                         remove_empty_nodes=True, remove_metadata=False,
                                         remove_deps=True, remove_misc=True, seed=43 ):
    '''
    Reannotates `in_file` with given `morph_pipeline` and saves results into `out_file`.
    During reannotation, values of `lemma`, `upos`, `xpos` and `feats` will be replaced
    with corresponding automatically tagged values from `morph_layer` (a morph_analysis,
    morph_extended or ud_morph_analysis layer). In case of ambiguity, the annotation is
    chosen randomly. Use `seed` to provide the seed value for random choices.
    `morph_pipeline` must be a list of taggers that can be applied on the `Text` object
    loaded from `in_file`. Both `in_file` and `out_file` are CONLL-U format files.

    Requires EstNLTK version 1.7.2+.

    Parameters
    -----------
    in_file
        name/path of the CONLL-U format input file.
    morph_pipeline
        list of `Tagger`-s (in the order of layer dependencies) that can be applied
        on the `Text` object loaded from `in_file`.
        this pipeline should produce `morph_layer`, which is used to reannotate
        the input file.
    morph_layer
        name of the morphological annotations layer produced by `morph_pipeline`.
        Must be 'morph_analysis', 'morph_extended' or 'ud_morph_analysis'.
        Values from this layer are used to reannotate values of `lemma`, `upos`,
        `xpos` and `feats` in the input file.
    out_file
        name/path of the CONLL-U format output file.
    dictionarize
        If True (default), then values of `feats` will be converted to a dictionary of
        features. Only applies to 'morph_analysis' and 'morph_extended' layers.
    replace_lemma_by_root
        If True, then lemmas will be replaced by root values from morph_analysis layer.
        Only applies to 'morph_analysis' and 'morph_extended' layers.
        Default: False.
    remove_metadata:
        If True, then sentence metadata will be removed from the output conllu file.
        This might be necessary if you want to create data for legacy parsers such as
        MaltParser.
        Default: False.
    remove_empty_nodes
        If True (default), then null / empty nodes (of the enhanced representation) will
        be removed from the output.
    remove_deps
        If True (default), then values of `deps` field will be replaced with `_`.
    remove_misc
        If True (default), then values of `misc` field will be replaced with `_`.
    seed
        Seed of the random generator that picks one annotation for ambiguous words.
        Default: 43.

    Raises
    ------
    Exception
        If no tagger in `morph_pipeline` produces `morph_layer`, or if `morph_layer`
        is not one of the supported layer names.
    '''
    # Validate input pipeline
    assert isinstance(morph_pipeline, list)
    assert isinstance(morph_layer, str)
    has_morph_layer = False
    for tagger in morph_pipeline:
        assert isinstance(tagger, Tagger)
        if morph_layer == tagger.output_layer:
            has_morph_layer = True
    if not has_morph_layer:
        # The two literals are joined BEFORE .format() is applied; otherwise only the
        # second literal gets formatted and the first keeps a raw '{!r}' placeholder.
        raise Exception(('(!) No tagger in the input pipeline {!r} creates '
                         'required layer {!r}.').format(morph_pipeline, morph_layer))
    # Import text from conllu (once as conllu TokenList-s, once as EstNLTK Text)
    with open(in_file, 'r', encoding='utf-8') as input_file:
        conll_sentences = conllu.parse(input_file.read())
    text_obj = conll_to_text(in_file, remove_empty_nodes=remove_empty_nodes)
    assert len(text_obj['sentences']) == len(conll_sentences), \
        ('(!) Mismatching sentence numbers in estnltk import ({})'
         ' and conllu import ({}).').format( len(text_obj['sentences']),
                                              len(conll_sentences) )
    # Annotate text
    for tagger in morph_pipeline:
        tagger.tag( text_obj )
    # If required, remove orphans / null nodes
    if remove_empty_nodes:
        token_count = 0
        for sentence in conll_sentences:
            # Empty nodes have a 3-element tuple id with '.' in the middle
            removables = []
            for token in sentence:
                token_id = token['id']
                if isinstance(token_id, tuple) and len(token_id) == 3 and token_id[1] == '.':
                    removables.append(token)
            # Remove after the scan: the sentence must not change while iterating it
            for token in removables:
                sentence.remove(token)
            token_count += len(sentence)
        assert token_count == len(text_obj[morph_layer]), \
             f'(!) Token count mismatch: tokens from CONLL file: {token_count} '+\
             f'vs tokens from EstNLTK annotated text: {len(text_obj[morph_layer])}.'
    # Carry annotations over to TokenList-s
    word_id = 0
    # In case of an ambiguity, pick random analysis.
    # Fix seed for repeatability
    rand = Random()
    rand.seed( seed )
    for sentence in conll_sentences:
        for token in sentence:
            # Tokens and layer spans are aligned by their running index
            word_span = text_obj[morph_layer][word_id]
            assert word_span.text == token["form"]
            annotation = rand.choice(word_span.annotations)
            if morph_layer in ['morph_analysis', 'morph_extended']:
                token['upos'] = annotation['partofspeech']
                token['xpos'] = annotation['partofspeech']
                token['feats'] = annotation['form']
                token['lemma'] = annotation['lemma']
                if replace_lemma_by_root:
                    if 'root' in annotation:
                        token['lemma'] = annotation['root']
                    else:
                        # Find the same analysis from morph_analysis layer
                        # Get lemma from there
                        word_span2 = text_obj['morph_analysis'][word_id]
                        for annotation2 in word_span2.annotations:
                            if annotation2['lemma'] == annotation['lemma'] and \
                               annotation2['partofspeech'] == annotation['partofspeech']:
                                token['lemma'] = annotation2['root']
                                break
                if dictionarize:
                    # Format form as a dictionary (each category maps to itself)
                    form_parts = annotation['form'].split()
                    token['feats'] = {f:f for f in form_parts}
            elif morph_layer == 'ud_morph_analysis':
                token['upos'] = annotation['upostag']
                token['xpos'] = annotation['xpostag']
                token['feats'] = annotation['feats']
                token['lemma'] = annotation['lemma']
            else:
                raise Exception(f'(!) Unexpected morph_layer: {morph_layer!r}')
            # Normalize empty feats (empty string / empty dict / None) to None
            if not token['feats']:
                token['feats'] = None
            if remove_misc and token['misc'] is not None:
                token['misc'] = None
            if remove_deps and token['deps'] is not None:
                token['deps'] = None
            word_id += 1
    # Export annotated file
    # (separate handle name: do not shadow the `out_file` path parameter)
    with open(out_file, 'w', encoding='utf-8') as output_file:
        for sentence in conll_sentences:
            if remove_metadata:
                sentence.metadata = None
            output_file.write( sentence.serialize() )
def convert_and_compare_against_reference( in_file, morph_pipeline, morph_layer, ref_file,
                                           dictionarize=True, remove_empty_nodes=False,
                                           remove_misc=True, remove_deps=True, seed=43 ):
    '''
    Reannotates `in_file` with given `morph_pipeline` and compares results against `ref_file`.
    Prints numbers/percentages of matching `upos`, `lemma` and `feats` values.
    Nothing is written to disk and nothing is returned.
    For description of parameters, see `convert_ud_conllu_to_estnltk_conllu(...)`.

    Note: annotations are read via the 'partofspeech' and 'form' attributes, so
    `morph_layer` must be a morph_analysis / morph_extended type of layer.

    Raises
    ------
    Exception
        If no tagger in `morph_pipeline` produces `morph_layer`.
    '''
    def _strip_empty_nodes( sentences ):
        # Removes empty nodes (ids like (8, '.', 1)) in place, returns remaining token count
        remaining = 0
        for sentence in sentences:
            removables = []
            for token in sentence:
                token_id = token['id']
                if isinstance(token_id, tuple) and len(token_id) == 3 and token_id[1] == '.':
                    removables.append(token)
            for token in removables:
                sentence.remove(token)
            remaining += len(sentence)
        return remaining

    # Validate input pipeline
    assert isinstance(morph_pipeline, list)
    assert isinstance(morph_layer, str)
    has_morph_layer = False
    for tagger in morph_pipeline:
        assert isinstance(tagger, Tagger)
        if morph_layer == tagger.output_layer:
            has_morph_layer = True
    if not has_morph_layer:
        # The two literals are joined BEFORE .format() is applied; otherwise only the
        # second literal gets formatted and the first keeps a raw '{!r}' placeholder.
        raise Exception(('(!) No tagger in the input pipeline {!r} creates '
                         'required layer {!r}.').format(morph_pipeline, morph_layer))
    # Import text from conllu
    with open(in_file, 'r', encoding='utf-8') as input_file:
        conll_sentences = conllu.parse(input_file.read())
    # Import reference conllu
    with open(ref_file, 'r', encoding='utf-8') as input_file:
        ref_conll_sentences = conllu.parse(input_file.read())
    text_obj = conll_to_text(in_file, remove_empty_nodes=remove_empty_nodes)
    assert len(text_obj['sentences']) == len(conll_sentences), \
        ('(!) Mismatching sentence numbers in estnltk import ({})'
         ' and conllu import ({}).').format( len(text_obj['sentences']),
                                              len(conll_sentences) )
    # Annotate text
    for tagger in morph_pipeline:
        tagger.tag( text_obj )
    # If required, remove orphans / null nodes (from both input and reference)
    if remove_empty_nodes:
        for sentences in (conll_sentences, ref_conll_sentences):
            token_count = _strip_empty_nodes( sentences )
            assert token_count == len(text_obj[morph_layer]), \
                 f'(!) Token count mismatch: tokens from CONLL file: {token_count} '+\
                 f'vs tokens from EstNLTK annotated text: {len(text_obj[morph_layer])}.'
    # Carry annotations over to TokenList-s
    # In case of an ambiguity, pick random analysis.
    # Fix seed for repeatability
    rand = Random()
    rand.seed(seed)
    word_id = 0
    matches_with_ref_pos   = 0
    matches_with_ref_feats = 0
    matches_with_ref_lemma = 0
    matches_complete = 0
    for sid, sentence in enumerate(conll_sentences):
        ref_sentence = ref_conll_sentences[sid]
        for tid, token in enumerate(sentence):
            word_span = text_obj[morph_layer][word_id]
            assert word_span.text == token["form"], f'{word_span.text!r} vs {token["form"]!r}'
            annotation = rand.choice(word_span.annotations)
            ref_token = ref_sentence[tid]
            token['upos'] = annotation['partofspeech']
            token['xpos'] = annotation['partofspeech']
            token['feats'] = annotation['form']
            # NOTE(review): token['lemma'] is not replaced here, so the lemma comparison
            # below uses the lemma of the input file -- confirm this is intended.
            if dictionarize:
                # Format form as a dictionary (each category maps to itself)
                form_parts = annotation['form'].split()
                token['feats'] = {f:f for f in form_parts}
            # Normalize empty feats (empty string / empty dict / None) to None
            if not token['feats']:
                token['feats'] = None
            if remove_misc and token['misc'] is not None:
                token['misc'] = None
            if remove_deps and token['deps'] is not None:
                token['deps'] = None
            word_id += 1
            same_pos   = token['upos'] == ref_token['upos']
            same_feats = token['feats'] == ref_token['feats']
            same_lemma = token['lemma'] == ref_token['lemma']
            if same_pos:
                matches_with_ref_pos += 1
            if same_feats:
                matches_with_ref_feats += 1
            if same_lemma:
                matches_with_ref_lemma += 1
            if same_pos and same_feats and same_lemma:
                matches_complete += 1

    def _percent( count ):
        # Guard against empty input: no tokens means 0.00% instead of ZeroDivisionError
        return count*100.0/word_id if word_id else 0.0

    per_pos   = _percent(matches_with_ref_pos)
    per_feats = _percent(matches_with_ref_feats)
    per_lemma = _percent(matches_with_ref_lemma)
    per_comp  = _percent(matches_complete)
    print('seed:', seed, f'| upos matches: {matches_with_ref_pos}/{word_id} ({per_pos:.2f}%) |'+\
          f' feats matches: {matches_with_ref_feats}/{word_id} ({per_feats:.2f}%) |'+\
          f' lemma matches: {matches_with_ref_lemma}/{word_id} ({per_lemma:.2f}%) |'+\
          f' complete matches: {matches_complete}/{word_id} ({per_comp:.2f}%) |')
+ seed: 43 | upos matches: 42801/44686 (95.78%) | feats matches: 44686/44686 (100.00%) | lemma matches: 44686/44686 (100.00%) | complete matches: 42801/44686 (95.78%) | + Conversion target: 'et_edt-ud-test-morph_extended.conllu', morph layer: 'morph_extended' ... + seed: 43 | upos matches: 46742/48532 (96.31%) | feats matches: 48530/48532 (100.00%) | lemma matches: 48532/48532 (100.00%) | complete matches: 46740/48532 (96.31%) | + Conversion target: 'et_edt-ud-train-morph_extended.conllu', morph layer: 'morph_extended' ... + seed: 43 | upos matches: 331461/344953 (96.09%) | feats matches: 343263/344953 (99.51%) | lemma matches: 344953/344953 (100.00%) | complete matches: 330010/344953 (95.67%) | + + Conversion target: 'et_edt-ud-dev-morph_analysis.conllu', morph layer: 'morph_analysis' ... + seed: 43 | upos matches: 39868/44686 (89.22%) | feats matches: 44686/44686 (100.00%) | lemma matches: 44686/44686 (100.00%) | complete matches: 39868/44686 (89.22%) | + Conversion target: 'et_edt-ud-test-morph_analysis.conllu', morph layer: 'morph_analysis' ... + seed: 43 | upos matches: 43338/48532 (89.30%) | feats matches: 48530/48532 (100.00%) | lemma matches: 48532/48532 (100.00%) | complete matches: 43338/48532 (89.30%) | + Conversion target: 'et_edt-ud-train-morph_analysis.conllu', morph layer: 'morph_analysis' ... + seed: 43 | upos matches: 308432/344953 (89.41%) | feats matches: 344920/344953 (99.99%) | lemma matches: 344953/344953 (100.00%) | complete matches: 308425/344953 (89.41%) | + + 2) Results if upos/xpos is always taken from the first annotation: + Conversion target: 'et_edt-ud-dev-morph_extended.conllu', morph layer: 'morph_extended' ... + seed: 43 | upos matches: 43259/44686 (96.81%) | feats matches: 44686/44686 (100.00%) | lemma matches: 44686/44686 (100.00%) | complete matches: 43259/44686 (96.81%) | + Conversion target: 'et_edt-ud-test-morph_extended.conllu', morph layer: 'morph_extended' ... 
+ seed: 43 | upos matches: 47169/48532 (97.19%) | feats matches: 48530/48532 (100.00%) | lemma matches: 48532/48532 (100.00%) | complete matches: 47167/48532 (97.19%) | + Conversion target: 'et_edt-ud-train-morph_extended.conllu', morph layer: 'morph_extended' ... + seed: 43 | upos matches: 335582/344953 (97.28%) | feats matches: 343263/344953 (99.51%) | lemma matches: 344953/344953 (100.00%) | complete matches: 334002/344953 (96.83%) | + + Conversion target: 'et_edt-ud-dev-morph_analysis.conllu', morph layer: 'morph_analysis' ... + seed: 43 | upos matches: 40390/44686 (90.39%) | feats matches: 44686/44686 (100.00%) | lemma matches: 44686/44686 (100.00%) | complete matches: 40390/44686 (90.39%) | + Conversion target: 'et_edt-ud-test-morph_analysis.conllu', morph layer: 'morph_analysis' ... + seed: 43 | upos matches: 43902/48532 (90.46%) | feats matches: 48530/48532 (100.00%) | lemma matches: 48532/48532 (100.00%) | complete matches: 43902/48532 (90.46%) | + Conversion target: 'et_edt-ud-train-morph_analysis.conllu', morph layer: 'morph_analysis' ... + seed: 43 | upos matches: 313489/344953 (90.88%) | feats matches: 344920/344953 (99.99%) | lemma matches: 344953/344953 (100.00%) | complete matches: 313482/344953 (90.88%) | + ''' + start = datetime.now() + ref_conllu_file_name_pat = re.compile('^(\S+)-([^\-]+)\.conllu$') + assert os.path.isdir( in_dir ) + for ref_dir in ref_dirs: + assert os.path.isdir( ref_dir ) + conllu_files_found = 0 + for ref_fname in os.listdir( ref_dir ): + if ref_fname in ref_skip_list: + continue + m = ref_conllu_file_name_pat.match(ref_fname) + if m: + ref_fpath = os.path.join(ref_dir, ref_fname) + input_file_prefix = m.group(1) + morph_layer_name = m.group(2) + # Detect input file + in_fpath = None + for in_fname in os.listdir( in_dir ): + if in_fname.startswith(input_file_prefix) and in_fname.endswith('.conllu'): + in_fpath = os.path.join(in_dir, in_fname) + if in_fpath is None: + raise FileNotFoundError( ('(!) 
def extract_clauses( conf_file ):
    '''
    Splits sentences in CONLLU files into clauses with EstNLTK,
    cleans clauses and saves as new CONLLU files.
    Inputs/outputs and parameters of the processing will be read
    from the given `conf_file`.
    Executes sections in the configuration starting with prefix
    'extract_clauses_'.

    Section parameters
    ------------------
    input_dir
        (required) existing directory with input '.conllu' files.
    output_dir
        (required) directory for output files; created if missing.
        Must not be the same directory as `input_dir`, because output
        files have the same names as input files.
    remove_empty_nodes
        (optional boolean, default True) passed on to `conll_to_text`.
    skip_list
        (optional) comma-separated names of input files to be skipped.

    Raises
    ------
    FileNotFoundError
        If `conf_file` or a section's `input_dir` does not exist.
    ValueError
        If `conf_file` cannot be read as INI, a required parameter is
        missing, or `output_dir` points to the same directory as `input_dir`.
    '''
    # Parse configuration file
    config = configparser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise FileNotFoundError("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))
    section_found = False
    start = datetime.now()
    for section in config.sections():
        if not section.startswith('extract_clauses_'):
            continue
        section_found = True
        print(f'Performing {section} ...')
        # Collect clause tagging parameters
        if not config.has_option(section, 'input_dir'):
            raise ValueError(f'Error in {conf_file}: section {section!r} is missing "input_dir" parameter.')
        input_dir = config[section]['input_dir']
        if not os.path.isdir(input_dir):
            raise FileNotFoundError(f'Error in {conf_file}: invalid "input_dir" value {input_dir!r} in {section!r}.')
        if not config.has_option(section, 'output_dir'):
            raise ValueError(f'Error in {conf_file}: section {section!r} is missing "output_dir" parameter.')
        output_dir = config[section]['output_dir']
        # Compare resolved paths: 'dir' and 'dir/' (or a relative vs an absolute
        # spelling) name the same directory and would overwrite the input files.
        if os.path.realpath(input_dir) == os.path.realpath(output_dir):
            raise ValueError(f'Error in {conf_file}: section {section!r} "output_dir" cannot be same as "input_dir".')
        os.makedirs(output_dir, exist_ok=True)
        remove_empty_nodes = config[section].getboolean('remove_empty_nodes', True)
        # Always a set of file names (empty if the parameter is missing or blank)
        raw_skip_list = config[section].get('skip_list', '')
        skip_list = {name.strip() for name in raw_skip_list.split(',') if name.strip()}
        for fname in os.listdir(input_dir):
            if fname in skip_list:
                continue
            fpath = os.path.join(input_dir, fname)
            if not (os.path.isfile(fpath) and fname.endswith('.conllu')):
                continue
            text_obj = conll_to_text( fpath, 'ud_syntax', remove_empty_nodes=remove_empty_nodes )
            print('Tagging clauses in: ', fname)
            text_obj.tag_layer('clauses')
            expected_layers = {
                'clauses', 'compound_tokens', 'morph_analysis',
                'sentences', 'tokens', 'ud_syntax', 'words'
            }
            assert text_obj.layers == expected_layers, 'Unexpected layers'

            print('Writing out results to: ', output_dir)
            output_fname = os.path.join(output_dir, fname)
            valid_clauses = 0
            invalid_clauses = 0
            # Context manager closes the file even if cleaning/exporting a clause fails
            with open(output_fname, 'wt', encoding='utf-8') as output_file:
                for clause in text_obj.clauses:
                    cleaned_clause = clean_clause(clause)

                    # Only clauses with exactly one root location are exported
                    if len(cleaned_clause['root_loc']) != 1:
                        invalid_clauses += 1
                        continue

                    # Separator goes between clauses, not before the first one
                    if valid_clauses > 0:
                        output_file.write('\n\n')

                    output_file.write(export_cleaned_clause(cleaned_clause))
                    valid_clauses += 1

                # Add final empty line (to avoid UDError: The CoNLL-U file does not end with empty line)
                output_file.write('\n\n')

            print('Valid clauses:   {}'.format(valid_clauses))
            print('Invalid clauses: {} (missing root)'.format(invalid_clauses))
    if section_found:
        print(f'Total processing time: {datetime.now()-start}')
    else:
        print(f'No section starting with "extract_clauses_" in {conf_file}.')
Missing input argument: name of the configuration INI file.') + # Try to execute all input files as configurations + for conf_file in sys.argv[1:]: + extract_clauses( conf_file ) + + diff --git a/01c_analyse_sketches.ipynb b/01c_analyse_sketches.ipynb new file mode 100644 index 00000000..a40781ac --- /dev/null +++ b/01c_analyse_sketches.ipynb @@ -0,0 +1,808 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7d65d63a", + "metadata": {}, + "source": [ + "# Analyse syntax sketches" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "eb614e41", + "metadata": {}, + "outputs": [], + "source": [ + "import os, os.path\n", + "import conllu\n", + "from statistics import median\n", + "from collections import Counter, OrderedDict\n", + "\n", + "from numpy import argmax \n", + "from numpy import arange\n", + "from numpy import array\n", + "from numpy import cumsum\n", + "from numpy import quantile\n", + "from pandas import DataFrame\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from syntax_sketches.syntax_sketch import compute_sketches\n", + "from syntax_sketches.deprel_seq import collect_deprel_seqs" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "611f6180", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "# Input directory (must contain conllu files with full sentences)\n", + "input_dir_full_sentences = 'edt_2.6\\\\preannotated\\\\morph_extended'\n", + "assert os.path.isdir(input_dir_full_sentences)\n", + "\n", + "# Input directory (must contain conllu files with sentences split into clauses)\n", + "input_dir = 'edt_2.6\\\\preannotated\\\\morph_extended_clauses'\n", + "assert os.path.isdir(input_dir)\n", + "\n", + "# Save top 50 sketches for knock-out experiments into this file\n", + "knockout_list_file = 'edt_2.6\\\\preannotated\\\\morph_extended_clauses\\\\top_50_sketches.csv'" + ] + }, + { + "cell_type": "markdown", + "id": "ece9c322", + "metadata": {}, + "source": [ + "## [Prelude] A 
quick study of full sentence deprel sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8e7e2a21", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "et_edt-ud-dev-morph_extended.conllu | #sentences/deprel_sequences: 3125\n", + "et_edt-ud-test-morph_extended.conllu | #sentences/deprel_sequences: 3214\n", + "et_edt-ud-train-morph_extended.conllu | #sentences/deprel_sequences: 24633\n", + "\n", + "Dependency relation sequences occurring only once: 85.96%\n" + ] + } + ], + "source": [ + "all_deprel_sequences = []\n", + "for fname in os.listdir(input_dir_full_sentences):\n", + " if fname.endswith('.conllu') and fname != 'train_full.conllu':\n", + " deprel_seqs = collect_deprel_seqs(os.path.join(input_dir_full_sentences, fname))\n", + " print(fname, '| #sentences/deprel_sequences: ', len(deprel_seqs))\n", + " all_deprel_sequences.extend(deprel_seqs)\n", + "\n", + "print()\n", + "deprel_seq_counter = Counter(all_deprel_sequences)\n", + "singleton_deprel_seqs = sum(array(list(deprel_seq_counter.values())) == 1)\n", + "print('Dependency relation sequences occurring only once: {:.2f}%'.format(singleton_deprel_seqs/len(all_deprel_sequences) * 100))" + ] + }, + { + "cell_type": "markdown", + "id": "b79e236a", + "metadata": {}, + "source": [ + "## I. 
Compute sketches for the whole corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0cb83f70", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "et_edt-ud-dev-morph_extended.conllu | #clauses: 5708\n", + "et_edt-ud-test-morph_extended.conllu | #clauses: 6033\n", + "et_edt-ud-train-morph_extended.conllu | #clauses: 43966\n", + "\n", + "#clauses total: 55707\n" + ] + } + ], + "source": [ + "sketches, clauses_count_total = compute_sketches(input_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "daf8e157", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "55707" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sketches)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fd843d42", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['[V]obl(L)',\n", + " '[V]nsubj(L)obj(P)obl(L)obl(L)',\n", + " '[V]mark(L)nsubj(P)',\n", + " '[S]advmod(L)aux(L)nsubj(L)obj(L)',\n", + " '[V]aux(L)nsubj(L)',\n", + " '[S]amod(L)cop(L)nsubj:cop(L)obl(P)',\n", + " '[V]advmod(L)nsubj(L)obl(P)',\n", + " '[V]nsubj(ÜP)obj(L)',\n", + " '[S]amod(L)cop(L)nmod(L)nsubj:cop(L)',\n", + " '[V]advmod(L)nsubj(P)obl(L)']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# list of sketches for all clauses\n", + "sketches[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "d70ed01d", + "metadata": {}, + "source": [ + "Sketch name encoding:\n", + "* [ clause_root_postag ] -- V (verb), S (substantive, adjective, pronoun, abbreviation or numeral), X (other postag);\n", + " * first_level_child_deprel ( first_level_child_subtree_size* )\n", + " * (L) -- short subtree: up to two nodes;\n", + " * (P) -- long subtree: three to nine nodes;\n", + " * (ÜP) -- extra long subtree: ten or more nodes;\n", + " * \\* subtree size includes the 
first level child node itself;" + ] + }, + { + "cell_type": "markdown", + "id": "94756829", + "metadata": {}, + "source": [ + "## II. Analyse distribution of syntax sketches" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "67b7a0c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of valid clauses: 55707\n", + "Number of different sketches: 7925\n", + "Expected sketch support: 7.03\n", + "Median sketch support: 1.00\n", + "Singleton sketches count: 4954\n", + "Singleton sketches: 62.51%\n", + "Non-singleton sketches: 37.49%\n" + ] + } + ], + "source": [ + "sketch_counter = Counter(sketches)\n", + "print('Number of valid clauses: {}'.format(clauses_count_total))\n", + "print('Number of different sketches: {}'.format(len(sketch_counter)))\n", + "print('Expected sketch support: {:.2f}'.format(clauses_count_total/len(sketch_counter)))\n", + "print('Median sketch support: {:.2f}'.format(median(sketch_counter.values())))\n", + "print('Singleton sketches count: {:}'.format(sum(array(list(sketch_counter.values())) == 1)))\n", + "print('Singleton sketches: {:.2f}%'.format(sum(array(list(sketch_counter.values())) == 1)/len(sketch_counter.values()) * 100))\n", + "print('Non-singleton sketches: {:.2f}%'.format(sum(array(list(sketch_counter.values())) > 1)/len(sketch_counter.values()) * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1ddbff0f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "qtls = arange(0, 1.1, 0.1)\n", + "plt.scatter(x=qtls, y=quantile(list(sketch_counter.values()),q=qtls))\n", + "plt.title('Quantiles for sketch supports')\n", + "plt.xlabel('Quantile')\n", + "plt.ylabel('Sketch support')\n", + "plt.yscale('log')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "07291848", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Top sketchesSupport
0[V]nsubj(L)1918
1[S]1891
2[V]nsubj(L)obl(L)979
3[V]883
4[S]cop(L)nsubj:cop(L)853
5[V]nsubj(L)obj(L)760
6[V]advmod(L)nsubj(L)663
7[V]obj(L)594
8[V]obl(L)557
9[V]nsubj(L)obj(L)obl(L)534
10[V]obj(L)obl(L)489
11[S]advmod(L)cop(L)nsubj:cop(L)485
12[V]nsubj(L)obl(P)484
13[V]nsubj(P)477
14[V]advmod(L)nsubj(L)obl(L)476
15[S]nmod(L)423
16[V]nsubj(P)obl(L)418
17[V]nsubj(L)obj(P)396
18[V]advmod(L)nsubj(L)obj(L)366
19[S]cop(L)nsubj:cop(P)365
20[V]advmod(L)357
21[V]nsubj(L)obl(L)obl(L)338
22[X]311
23[V]nsubj(L)xcomp(P)305
24[V]obj(P)291
25[V]aux(L)nsubj(L)283
26[S]cop(L)nsubj:cop(L)obl(L)267
27[S]nummod(L)264
28[V]nsubj(L)xcomp(L)258
29[V]nsubj(L)obl(L)obl(P)248
30[V]advmod(L)obj(L)242
31[V]nsubj(P)obj(L)236
32[V]aux(L)nsubj(L)obj(L)234
33[V]xcomp(P)217
34[S]amod(L)cop(L)nsubj:cop(L)213
35[V]nsubj(P)obl(P)212
36[V]obl(P)207
37[V]aux(L)nsubj(L)obl(L)206
38[V]advmod(L)obl(L)203
39[S]amod(L)203
40[V]obj(P)obl(L)195
41[V]nsubj(L)obj(P)obl(L)189
42[S]cop(L)nmod(L)nsubj:cop(L)188
43[V]nsubj(L)obj(L)obl(P)187
44[V]advmod(L)nsubj(L)obj(L)obl(L)185
45[V]advmod(L)nsubj(L)obl(P)184
46[S]advmod(L)177
47[V]obj(L)obl(P)177
48[V]advmod(L)nsubj(P)173
49[V]aux(L)173
\n", + "
" + ], + "text/plain": [ + " Top sketches Support\n", + "0 [V]nsubj(L) 1918\n", + "1 [S] 1891\n", + "2 [V]nsubj(L)obl(L) 979\n", + "3 [V] 883\n", + "4 [S]cop(L)nsubj:cop(L) 853\n", + "5 [V]nsubj(L)obj(L) 760\n", + "6 [V]advmod(L)nsubj(L) 663\n", + "7 [V]obj(L) 594\n", + "8 [V]obl(L) 557\n", + "9 [V]nsubj(L)obj(L)obl(L) 534\n", + "10 [V]obj(L)obl(L) 489\n", + "11 [S]advmod(L)cop(L)nsubj:cop(L) 485\n", + "12 [V]nsubj(L)obl(P) 484\n", + "13 [V]nsubj(P) 477\n", + "14 [V]advmod(L)nsubj(L)obl(L) 476\n", + "15 [S]nmod(L) 423\n", + "16 [V]nsubj(P)obl(L) 418\n", + "17 [V]nsubj(L)obj(P) 396\n", + "18 [V]advmod(L)nsubj(L)obj(L) 366\n", + "19 [S]cop(L)nsubj:cop(P) 365\n", + "20 [V]advmod(L) 357\n", + "21 [V]nsubj(L)obl(L)obl(L) 338\n", + "22 [X] 311\n", + "23 [V]nsubj(L)xcomp(P) 305\n", + "24 [V]obj(P) 291\n", + "25 [V]aux(L)nsubj(L) 283\n", + "26 [S]cop(L)nsubj:cop(L)obl(L) 267\n", + "27 [S]nummod(L) 264\n", + "28 [V]nsubj(L)xcomp(L) 258\n", + "29 [V]nsubj(L)obl(L)obl(P) 248\n", + "30 [V]advmod(L)obj(L) 242\n", + "31 [V]nsubj(P)obj(L) 236\n", + "32 [V]aux(L)nsubj(L)obj(L) 234\n", + "33 [V]xcomp(P) 217\n", + "34 [S]amod(L)cop(L)nsubj:cop(L) 213\n", + "35 [V]nsubj(P)obl(P) 212\n", + "36 [V]obl(P) 207\n", + "37 [V]aux(L)nsubj(L)obl(L) 206\n", + "38 [V]advmod(L)obl(L) 203\n", + "39 [S]amod(L) 203\n", + "40 [V]obj(P)obl(L) 195\n", + "41 [V]nsubj(L)obj(P)obl(L) 189\n", + "42 [S]cop(L)nmod(L)nsubj:cop(L) 188\n", + "43 [V]nsubj(L)obj(L)obl(P) 187\n", + "44 [V]advmod(L)nsubj(L)obj(L)obl(L) 185\n", + "45 [V]advmod(L)nsubj(L)obl(P) 184\n", + "46 [S]advmod(L) 177\n", + "47 [V]obj(L)obl(P) 177\n", + "48 [V]advmod(L)nsubj(P) 173\n", + "49 [V]aux(L) 173" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(DataFrame({\n", + " 'Top sketches': [sketch for sketch, _ in sketch_counter.most_common(50)],\n", + " 'Support': [count for _, count in sketch_counter.most_common(50)] \n", + "}))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": 
"31de9e87", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 50 sketches coverage: 37.58%\n" + ] + } + ], + "source": [ + "# Coverage of the 50 most frequent sketches\n", + "print('Top 50 sketches coverage: {:.2f}%'.format(sum([count for _, count in sketch_counter.most_common(50)])/clauses_count_total * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d052c52b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "coverage = cumsum(sorted(list(sketch_counter.values()), reverse=True))\n", + "coverage = coverage/coverage[-1] * 100\n", + "\n", + "plt.plot(range(1, len(coverage)+1), coverage, color='blue')\n", + "plt.axhline(y=70, color='r', linestyle='--')\n", + "plt.axvline(x=argmax(coverage >= 70) + 1, color='r', linestyle='--')\n", + "plt.title('Total coverage of most frequent sketches')\n", + "plt.xlabel('Number of sketches')\n", + "plt.ylabel('Total coverage')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5c75ecff", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHFCAYAAADv8c1wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy88F64QAAAACXBIWXMAAA9hAAAPYQGoP6dpAABG3klEQVR4nO3dfVwVdf7//+cR5IgIZ0EEJNHMC5JQ27RVtPIaNFHTysqW1c1FN1MzZStrS3I3Na/bdddcUykvwt28yL4koZluJnjBRkqauaWJN0FM4aDEAuL8/ujj/DyiNhh0UB/3221uN87Ma2ZeZzjJs/dcHJthGIYAAABwVXXc3QAAAMD1gNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBNSgpKQk2Ww2c/L09FTjxo316KOP6tChQzW67/379ysxMVFHjhypUq9W6y+WmJgom82m7777rsrr/tg2r3dV/T1YMWLECN16663Vtj2rbDabEhMTf/b9ArUFoQn4GSxbtkzp6enavHmzxo4dqw0bNuiee+5RQUFBje1z//79euWVV6r1jzWqriZ+Dy+99JLWrVtXbdsDYI2nuxsAbgaRkZHq2LGjJKl79+6qqKjQlClTtH79ev32t791c3e43rRo0cLdLQA3JUaaADe4EKBOnDjhMn/Dhg2KiopS/fr15evrqz59+ig9Pb3S+tu3b1evXr3k6+ur+vXrq0uXLkpJSTGXJyUl6eGHH5Yk9ejRwzw9mJSUVKU+N23apEGDBqlJkyaqV6+eWrZsqdGjR1/xNFxOTo6GDBkiPz8/ORwO/frXv9bJkycr1a1evVpRUVHy8fFRgwYNFBMTo88++6xKvV1s586dGjBggBo2bKh69eqpRYsWmjBhgkvNjx0z6cqnBC936vLWW29VbGysUlNTddddd8nb21u33367li5d6rLe1X4Pn332mWJjYxUUFCS73a7Q0FD1799fx44du+r7vdzpOZvNprFjx2r58uVq06aN6tevr/bt2+v//b//9yNH7weFhYWaNGmSbrvtNtntdgUFBen+++/Xl19+ecV1Tp48qTFjxigiIkINGjRQUFCQevbsqU8++cSlbuvWrbLZbNq6davL/CNHjlT6XH7zzTd69NFHFRoaKrvdruDgYPXq1UtZWVku61r5DFndFmAVoQlwg8OHD0uSWrdubc5btWqVBg0aJD8/P73zzjtasmSJCgoK1L17d23fvt2s27Ztm3r27Cmn06klS5bonXfeka+
vrwYMGKDVq1dLkvr3769p06ZJkv72t78pPT1d6enp6t+/f5X6/PrrrxUVFaWFCxcqLS1NL7/8snbu3Kl77rlH5eXlleoHDx6sli1b6t1331ViYqLWr1+vmJgYl9pp06bpscceU0REhP75z39q+fLlOnPmjO69917t37+/Sv1J0ocffqh7771XR48e1dy5c7Vx40b98Y9/dAmkVo7Ztfj88881adIkPfPMM3rvvffUrl07jRw5Uv/+978lXf33UFxcrD59+ujEiRP629/+pk2bNmn+/Plq2rSpzpw5c039pKSkaMGCBZo6darWrFmjgIAADR48WN98881V1ztz5ozuueceLVq0SL/97W/1/vvv64033lDr1q2Vm5t7xfVOnz4tSZoyZYpSUlK0bNky3XbbberevXulgGTV/fffr8zMTM2cOVObNm3SwoUL9ctf/lKFhYVmjdXPkJVtAVViAKgxy5YtMyQZGRkZRnl5uXHmzBkjNTXVCAkJMe677z6jvLzcMAzDqKioMEJDQ422bdsaFRUV5vpnzpwxgoKCjC5dupjzOnfubAQFBRlnzpwx5507d86IjIw0mjRpYpw/f94wDMP417/+ZUgyPv744yr1evjw4csuP3/+vFFeXm58++23hiTjvffeM5dNmTLFkGQ888wzLuusXLnSkGSsWLHCMAzDOHr0qOHp6WmMGzfOpe7MmTNGSEiIMXTo0Erb/DEtWrQwWrRoYZSUlFyxxuoxu9I+L3dsmjVrZtSrV8/49ttvzXklJSVGQECAMXr0aHPelX4Pe/bsMSQZ69ev/9H3eKnhw4cbzZo1c5knyQgODjaKiorMeXl5eUadOnWM6dOnX3V7U6dONSQZmzZtumqdJGPKlClXXH7u3DmjvLzc6NWrlzF48GBz/scff3zZY3D48GFDkrFs2TLDMAzju+++MyQZ8+fPv+I+rH6GrGwLqCpGmoCfQefOnVW3bl35+vqqb9++8vf313vvvSdPzx8uKzx48KCOHz+uuLg41anz//9n2aBBAz344IPKyMjQ999/r+LiYu3cuVMPPfSQGjRoYNZ5eHgoLi5Ox44d08GDB6ut7/z8fP3+979XWFiYPD09VbduXTVr1kySdODAgUr1jz/+uMvroUOHytPTUx9//LGkH0aFzp07p9/85jc6d+6cOdWrV0/dunWr8ujEV199pa+//lojR45UvXr1LltTk8fszjvvVNOmTc3X9erVU+vWrfXtt9/+6LotW7aUv7+/nnvuOb3xxhvXNMp2qR49esjX19d8HRwcrKCgoB/tZ+PGjWrdurV69+5d5X2+8cYbuuuuu1SvXj3zM/LRRx9d9vPxYwICAtSiRQvNmjVLc+fO1Weffabz58+71Fj9DFnZFlBVhCbgZ/D2229r9+7d2rJli0aPHq0DBw7oscceM5efOnVKktS4ceNK64aGhur8+fMqKChQQUGBDMO4Yt3F2/qpzp8/r+joaK1du1bPPvusPvroI+3atUsZGRmSpJKSkkrrhISEuLz29PRUw4YNzZ4unDK7++67VbduXZdp9erVVX5kwYXrpZo0aXLFmpo8Zg0bNqw0z263X/bYXMrhcGjbtm2688479cILL+iOO+5QaGiopkyZctlTnzXZz8mTJ696DK9k7ty5evLJJ9WpUyetWbNGGRkZ2r17t/r27WvpGFzKZrPpo48+UkxMjGbOnKm77rpLjRo10vjx481TllY/Q1a2BVQVd88BP4M2bdqYF3/36NFDFRUVevPNN/Xuu+/qoYceMv/YXe76kePHj6tOnTry9/eXYRiqU6fOFeskKTAwsFp6zs7O1ueff66kpCQNHz7cnP/f//73iuvk5eXplltuMV+fO3dOp06dMt/fhd7effddc8Tqp2jUqJEkXfXCaX9/f8vH7MJoVWlpqex2u1lXnc+fuljbtm2VnJwswzC0d+9eJSUlaerUqfL29tbzzz9fI/u8nEaNGv3oxeeXs2LFCnXv3l0LFy5
0mX9pKLn4uF7scse1WbNmWrJkiaQfRhL/+c9/KjExUWVlZXrjjTeq9Bn6sW0BVcVIE+AGM2fOlL+/v15++WWdP39e4eHhuuWWW7Rq1SoZhmHWFRcXa82aNeYddT4+PurUqZPWrl3r8n/y58+f14oVK9SkSRPz4vILf/Sv5f/4JZl3kV0cHiRp0aJFV1xn5cqVLq//+c9/6ty5c+revbskKSYmRp6envr666/VsWPHy05V0bp1a7Vo0UJLly6t9Af5gqocswt3pO3du9dlG++//36V+rqYld+DzWZT+/btNW/ePP3iF7/Qf/7zn2ve37Xo16+fvvrqK23ZsqVK69lstkqfj71791a64/NKx3XDhg1X3X7r1q31xz/+UW3btjWPybV+hi63LaCqGGkC3MDf31+TJ0/Ws88+q1WrVunXv/61Zs6cqccff1yxsbEaPXq0SktLNWvWLBUWFmrGjBnmutOnT1efPn3Uo0cPJSQkyMvLS3//+9+VnZ2td955xww7kZGRkqR//OMf8vX1Vb169dS8efPLnsK5nNtvv10tWrTQ888/L8MwFBAQoPfff1+bNm264jpr166Vp6en+vTpoy+++EIvvfSS2rdvr6FDh0r64Y/n1KlT9eKLL+qbb74xr+86ceKEdu3aJR8fH73yyitVOpZ/+9vfNGDAAHXu3FnPPPOMmjZtqqNHj+rDDz80Q5zVY3b//fcrICBAI0eO1NSpU+Xp6amkpCTl5ORUqaeLXen3kJ6err///e964IEHdNttt8kwDK1du1aFhYXq06fPNe/vWkyYMEGrV6/WoEGD9Pzzz+tXv/qVSkpKtG3bNsXGxqpHjx6XXS82NlZ/+tOfNGXKFHXr1k0HDx7U1KlT1bx5c507d86sCwkJUe/evTV9+nT5+/urWbNm+uijj7R27VqX7e3du1djx47Vww8/rFatWsnLy0tbtmzR3r17zZE3q58hK9sCqsyNF6EDN7wLd13t3r270rKSkhKjadOmRqtWrYxz584ZhmEY69evNzp16mTUq1fP8PHxMXr16mV8+umnldb95JNPjJ49exo+Pj6Gt7e30blzZ+P999+vVDd//nyjefPmhoeHh8tdSlfr9eI7xPbv32/06dPH8PX1Nfz9/Y2HH37YOHr0aKW7qC7cdZaZmWkMGDDAaNCggeHr62s89thjxokTJyrta/369UaPHj0MPz8/w263G82aNTMeeughY/PmzZW2aUV6errRr18/w+FwGHa73WjRokWlO/msHrNdu3YZXbp0MXx8fIxbbrnFmDJlivHmm29e9u65/v37V1q/W7duRrdu3VzmXe738OWXXxqPPfaY0aJFC8Pb29twOBzGr371KyMpKelH3++V7p576qmnKtU2a9bMGD58+I9us6CgwHj66aeNpk2bGnXr1jWCgoKM/v37G19++aXLPi7+vZeWlhoJCQnGLbfcYtSrV8+46667jPXr11+2v9zcXOOhhx4yAgICDIfDYfz617827yC88Lk8ceKEMWLECOP22283fHx8jAYNGhjt2rUz5s2bZ/43csGPfYaqsi3AKpthXHQuAAAAAJfFNU0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAh5uWY3Onz+v48ePy9fX13xYHgAAqN0Mw9CZM2cUGhrq8qXplyI0VaPjx48rLCzM3W0AAIBrkJOTc9UvryY0VSNfX19JPxx0Pz8/N3cDAACsKCoqUlhYmPl3/EoITdXowik5Pz8/QhMAANeZH7u0hgvBAQAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALPN3dAKy59fkUd7cANzsyo7+7WwCAmxojTQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0
AAAAWEJoAAAAsIDQBAABY4NbQtHDhQrVr105+fn7y8/NTVFSUNm7caC4fMWKEbDaby9S5c2eXbZSWlmrcuHEKDAyUj4+PBg4cqGPHjrnUFBQUKC4uTg6HQw6HQ3FxcSosLHSpOXr0qAYMGCAfHx8FBgZq/PjxKisrq7H3DgAAri9uDU1NmjTRjBkztGfPHu3Zs0c9e/bUoEGD9MUXX5g1ffv2VW5urjl98MEHLtuYMGGC1q1bp+TkZG3fvl1nz55VbGysKioqzJphw4YpKytLqampSk1NVVZWluLi4szlFRUV6t+/v4qLi7V9+3YlJydrzZo1mjRpUs0fBAAAcF2wGYZhuLuJiwUEBGjWrFkaOXKkRowYocLCQq1fv/6ytU6nU40aNdLy5cv1yCOPSJKOHz+usLAwffDBB4qJidGBAwcUERGhjIwMderUSZKUkZGhqKgoffnllwoPD9fGjRsVGxurnJwchYaGSpKSk5M1YsQI5efny8/Pz1LvRUVFcjgccjqdltex6tbnU6p1e7j+HJnR390tAMANyerf71pzTVNFRYWSk5NVXFysqKgoc/7WrVsVFBSk1q1bKz4+Xvn5+eayzMxMlZeXKzo62pwXGhqqyMhI7dixQ5KUnp4uh8NhBiZJ6ty5sxwOh0tNZGSkGZgkKSYmRqWlpcrMzLxiz6WlpSoqKnKZAADAjcntoWnfvn1q0KCB7Ha7fv/732vdunWKiIiQJPXr108rV67Uli1bNGfOHO3evVs9e/ZUaWmpJCkvL09eXl7y9/d32WZwcLDy8vLMmqCgoEr7DQoKcqkJDg52We7v7y8vLy+z5nKmT59uXiflcDgUFhZ27QcCAADUap7ubiA8PFxZWVkqLCzUmjVrNHz4cG3btk0RERHmKTdJioyMVMeOHdWsWTOlpKRoyJAhV9ymYRiy2Wzm64t//ik1l5o8ebImTpxovi4qKiI4AQBwg3L7SJOXl5datmypjh07avr06Wrfvr1ef/31y9Y2btxYzZo106FDhyRJISEhKisrU0FBgUtdfn6+OXIUEhKiEydOVNrWyZMnXWouHVEqKChQeXl5pRGoi9ntdvPOvwsTAAC4Mbk9NF3KMAzz9NulTp06pZycHDVu3FiS1KFDB9WtW1ebNm0ya3Jzc5Wdna0uXbpIkqKiouR0OrVr1y6zZufOnXI6nS412dnZys3NNWvS0tJkt9vVoUOHan+PAADg+uPW03MvvPCC+vXrp7CwMJ05c0bJycnaunWrUlNTdfbsWSUmJurBBx9U48aNdeTIEb3wwgsKDAzU4MGDJUkOh0MjR47UpEmT1LBhQwUEBCghIUFt27ZV7969JUlt2rRR3759FR8fr0WLFkmSRo0apdjYWIWHh0uSoqOjFRERobi4OM2aNUunT59WQkKC4uPjGT0CAACS3ByaTpw4obi4OOXm5srhcKhdu3ZKTU1Vnz59VFJSon379untt99WYWGhGjdurB49emj16tXy9fU1tzFv3jx5enpq6NChKikpUa9evZSUlCQPDw+zZuXKlRo/frx5l93AgQO1YMECc7mHh4dSUlI0ZswYde3aVd7e3ho2bJhmz5798x0MAABQq9W65zRdz3hOE2oSz2kCgJpx3T2nCQAAoDYjNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAreGpoULF6pdu3by8/OTn5+foqKitHHjRnO5YRhKTExUaGiovL291b17d33xxRcu2yg
tLdW4ceMUGBgoHx8fDRw4UMeOHXOpKSgoUFxcnBwOhxwOh+Li4lRYWOhSc/ToUQ0YMEA+Pj4KDAzU+PHjVVZWVmPvHQAAXF/cGpqaNGmiGTNmaM+ePdqzZ4969uypQYMGmcFo5syZmjt3rhYsWKDdu3crJCREffr00ZkzZ8xtTJgwQevWrVNycrK2b9+us2fPKjY2VhUVFWbNsGHDlJWVpdTUVKWmpiorK0txcXHm8oqKCvXv31/FxcXavn27kpOTtWbNGk2aNOnnOxgAAKBWsxmGYbi7iYsFBARo1qxZeuKJJxQaGqoJEyboueeek/TDqFJwcLBee+01jR49Wk6nU40aNdLy5cv1yCOPSJKOHz+usLAwffDBB4qJidGBAwcUERGhjIwMderUSZKUkZGhqKgoffnllwoPD9fGjRsVGxurnJwchYaGSpKSk5M1YsQI5efny8/Pz1LvRUVFcjgccjqdltex6tbnU6p1e7j+HJnR390tAMANyerf71pzTVNFRYWSk5NVXFysqKgoHT58WHl5eYqOjjZr7Ha7unXrph07dkiSMjMzVV5e7lITGhqqyMhIsyY9PV0Oh8MMTJLUuXNnORwOl5rIyEgzMElSTEyMSktLlZmZecWeS0tLVVRU5DIBAIAbk9tD0759+9SgQQPZ7Xb9/ve/17p16xQREaG8vDxJUnBwsEt9cHCwuSwvL09eXl7y9/e/ak1QUFCl/QYFBbnUXLoff39/eXl5mTWXM336dPM6KYfDobCwsCq+ewAAcL1we2gKDw9XVlaWMjIy9OSTT2r48OHav3+/udxms7nUG4ZRad6lLq25XP211Fxq8uTJcjqd5pSTk3PVvgAAwPXL7aHJy8tLLVu2VMeOHTV9+nS1b99er7/+ukJCQiSp0khPfn6+OSoUEhKisrIyFRQUXLXmxIkTlfZ78uRJl5pL91NQUKDy8vJKI1AXs9vt5p1/FyYAAHBjcntoupRhGCotLVXz5s0VEhKiTZs2mcvKysq0bds2denSRZLUoUMH1a1b16UmNzdX2dnZZk1UVJScTqd27dpl1uzcuVNOp9OlJjs7W7m5uWZNWlqa7Ha7OnToUKPvFwAAXB883bnzF154Qf369VNYWJjOnDmj5ORkbd26VampqbLZbJowYYKmTZumVq1aqVWrVpo2bZrq16+vYcOGSZIcDodGjhypSZMmqWHDhgoICFBCQoLatm2r3r17S5LatGmjvn37Kj4+XosWLZIkjRo1SrGxsQoPD5ckRUdHKyIiQnFxcZo1a5ZOnz6thIQExcfHM3oEAAAkuTk0nThxQnFxccrNzZXD4VC7du2UmpqqPn36SJKeffZZlZSUaMyYMSooKFCnTp2UlpYmX19fcxvz5s2Tp6enhg4dqpKSEvXq1UtJSUny8PAwa1auXKnx48ebd9kNHDhQCxYsMJd7eHgoJSVFY8aMUdeuXeXt7a1hw4Zp9uzZP9ORAAAAtV2te07T9YznNKEm8ZwmAKgZ191zmgAAAGozQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALHBraJo+fbruvvtu+fr6KigoSA888IAOHjzoUjNixAjZbDaXqXPnzi41paWlGjdunAIDA+Xj46OBAwfq2LFjLjUFBQWKi4uTw+GQw+FQXFycCgsLXWqOHj2qAQMGyMfHR4GBgRo/frzKyspq5L0DAIDri1tD07Zt2/TUU08pIyNDmzZt0rlz5xQdHa3i4mK
Xur59+yo3N9ecPvjgA5flEyZM0Lp165ScnKzt27fr7Nmzio2NVUVFhVkzbNgwZWVlKTU1VampqcrKylJcXJy5vKKiQv3791dxcbG2b9+u5ORkrVmzRpMmTarZgwAAAK4Lnu7ceWpqqsvrZcuWKSgoSJmZmbrvvvvM+Xa7XSEhIZfdhtPp1JIlS7R8+XL17t1bkrRixQqFhYVp8+bNiomJ0YEDB5SamqqMjAx16tRJkrR48WJFRUXp4MGDCg8PV1pamvbv36+cnByFhoZKkubMmaMRI0bo1VdflZ+fX00cAgAAcJ2oVdc0OZ1OSVJAQIDL/K1btyooKEitW7dWfHy88vPzzWWZmZkqLy9XdHS0OS80NFSRkZHasWOHJCk9PV0Oh8MMTJLUuXNnORwOl5rIyEgzMElSTEyMSktLlZmZedl+S0tLVVRU5DIBAIAbU60JTYZhaOLEibrnnnsUGRlpzu/Xr59WrlypLVu2aM6cOdq9e7d69uyp0tJSSVJeXp68vLzk7+/vsr3g4GDl5eWZNUFBQZX2GRQU5FITHBzsstzf319eXl5mzaWmT59uXiPlcDgUFhZ27QcAAADUam49PXexsWPHau/evdq+fbvL/EceecT8OTIyUh07dlSzZs2UkpKiIUOGXHF7hmHIZrOZry/++afUXGzy5MmaOHGi+bqoqIjgBADADapWjDSNGzdOGzZs0Mcff6wmTZpctbZx48Zq1qyZDh06JEkKCQlRWVmZCgoKXOry8/PNkaOQkBCdOHGi0rZOnjzpUnPpiFJBQYHKy8srjUBdYLfb5efn5zIBAIAbk1tDk2EYGjt2rNauXastW7aoefPmP7rOqVOnlJOTo8aNG0uSOnTooLp162rTpk1mTW5urrKzs9WlSxdJUlRUlJxOp3bt2mXW7Ny5U06n06UmOztbubm5Zk1aWprsdrs6dOhQLe8XAABcv9x6eu6pp57SqlWr9N5778nX19cc6XE4HPL29tbZs2eVmJioBx98UI0bN9aRI0f0wgsvKDAwUIMHDzZrR44cqUmTJqlhw4YKCAhQQkKC2rZta95N16ZNG/Xt21fx8fFatGiRJGnUqFGKjY1VeHi4JCk6OloRERGKi4vTrFmzdPr0aSUkJCg+Pp4RJAAA4N6RpoULF8rpdKp79+5q3LixOa1evVqS5OHhoX379mnQoEFq3bq1hg8frtatWys9PV2+vr7mdubNm6cHHnhAQ4cOVdeuXVW/fn29//778vDwMGtWrlyptm3bKjo6WtHR0WrXrp2WL19uLvfw8FBKSorq1aunrl27aujQoXrggQc0e/bsn++AAACAWstmGIbh7iZuFEVFRXI4HHI6ndU+OnXr8ynVuj1cf47M6O/uFgDghmT173etuBAcAACgtiM0AQAAWEBoAgAAsIDQBAAAYEGVQ1PPnj1VWFhYaX5RUZF69uxZHT0BAADUOlUOTVu3blVZWVml+f/73//0ySefVEtTAAAAtY3lh1vu3bvX/Hn//v0uXzlSUVGh1NRU3XLLLdXbHQAAQC1hOTTdeeedstlsstlslz0N5+3trb/+9a/V2hwAAEBtYTk0HT58WIZh6LbbbtOuXbvUqFEjc5mXl5eCgoJcnsANAABwI7Ecmpo1ayZJOn/+fI01AwAAUFtd0xf2fvXVV9q6davy8/MrhaiXX365WhoDAACoTaocmhYvXqwnn3xSgYGBCgkJkc1mM5fZbDZCEwAAuCFVOTT9+c9/1quvvqrnnnuuJvoBAAColar8nKaCggI9/PDDNdELAABArVXl0PTwww8rLS2tJnoBAACotap8eq5ly5Z66aWXlJGRobZt26pu3bouy8ePH19tzQEAANQWNsMwjKqs0Lx58ytvzGbTN99885Obul4VFRXJ4XDI6XTKz8+vWrd96/Mp1bo9XH+OzOjv7hYA4IZk9e93lUeaDh8+/JMaAwAAuB5V+ZomAACAm1GVR5qeeOKJqy5funTpNTcDAABQW1U
5NBUUFLi8Li8vV3Z2tgoLCy/7Rb4AAAA3giqHpnXr1lWad/78eY0ZM0a33XZbtTQFAABQ21TLNU116tTRM888o3nz5lXH5gAAAGqdarsQ/Ouvv9a5c+eqa3MAAAC1SpVPz02cONHltWEYys3NVUpKioYPH15tjQEAANQmVQ5Nn332mcvrOnXqqFGjRpozZ86P3lkHAABwvapyaPr4449rog8AAIBarcqh6YKTJ0/q4MGDstlsat26tRo1alSdfQEAANQqVb4QvLi4WE888YQaN26s++67T/fee69CQ0M1cuRIff/99zXRIwAAgNtVOTRNnDhR27Zt0/vvv6/CwkIVFhbqvffe07Zt2zRp0qSa6BEAAMDtqnx6bs2aNXr33XfVvXt3c979998vb29vDR06VAsXLqzO/gAAAGqFKo80ff/99woODq40PygoiNNzAADghlXl0BQVFaUpU6bof//7nzmvpKREr7zyiqKioqq1OQAAgNqiyqfnXn/9dfXt21dNmjRR+/btZbPZlJWVpXr16unDDz+siR4BAADcrsqhKTIyUocOHdKKFSv05ZdfyjAMPfroo3r88cfl7e1dEz0CAAC43TU9p8nb21vx8fHV3QsAAECtVeVrmqZPn66lS5dWmr906VK99tpr1dIUAABAbVPl0LRo0SLdfvvtlebfcccdeuONN6q0renTp+vuu++Wr6+vgoKC9MADD+jgwYMuNYZhKDExUaGhofL29lb37t31xRdfuNSUlpZq3LhxCgwMlI+PjwYOHKhjx4651BQUFCguLk4Oh0MOh0NxcXEqLCx0qTl69KgGDBggHx8fBQYGavz48SorK6vSewIAADemKoemvLw8NW7cuNL8Ro0aKTc3t0rb2rZtm5566illZGRo06ZNOnfunKKjo1VcXGzWzJw5U3PnztWCBQu0e/duhYSEqE+fPjpz5oxZM2HCBK1bt07Jycnavn27zp49q9jYWFVUVJg1w4YNU1ZWllJTU5WamqqsrCzFxcWZyysqKtS/f38VFxdr+/btSk5O1po1a3hgJwAAkHQN1zSFhYXp008/VfPmzV3mf/rppwoNDa3StlJTU11eL1u2TEFBQcrMzNR9990nwzA0f/58vfjiixoyZIgk6a233lJwcLBWrVql0aNHy+l0asmSJVq+fLl69+4tSVqxYoXCwsK0efNmxcTE6MCBA0pNTVVGRoY6deokSVq8eLGioqJ08OBBhYeHKy0tTfv371dOTo75PubMmaMRI0bo1VdflZ+fX1UPFQAAuIFUeaTpd7/7nSZMmKBly5bp22+/1bfffqulS5fqmWee+ckXhzudTklSQECAJOnw4cPKy8tTdHS0WWO329WtWzft2LFDkpSZmany8nKXmtDQUEVGRpo16enpcjgcZmCSpM6dO8vhcLjUREZGugS/mJgYlZaWKjMz87L9lpaWqqioyGUCAAA3piqPND377LM6ffq0xowZY17vU69ePT333HOaPHnyNTdiGIYmTpyoe+65R5GRkZJ+OBUoqdITyIODg/Xtt9+aNV5eXvL3969Uc2H9vLw8BQUFVdpnUFCQS82l+/H395eXl5dZc6np06frlVdeqepbBQAA16EqjzTZbDa99tprOnnypDIyMvT555/r9OnTevnll39SI2PHjtXevXv1zjvvXHafFzMMo9K8S11ac7n6a6m52OTJk+V0Os0pJyfnqj0BAIDrV5VD0wUNGjTQ3XffrcjISNnt9p/UxLhx47RhwwZ9/PHHatKkiTk/JCREkiqN9OTn55ujQiEhISorK1NBQcFVa06cOFFpvydPnnSpuXQ/BQUFKi8vv+x37Uk/nCr08/NzmQAAwI3pmkNTdTAMQ2PHjtXatWu1ZcuWSheXN2/eXCEhIdq0aZM5r6ysTNu2bVOXLl0kSR06dFDdunVdanJzc5WdnW3WREVFyel0ateuXWbNzp075XQ6XWqys7Nd7gBMS0uT3W5Xhw4dqv/NAwCA68o
1PRG8ujz11FNatWqV3nvvPfn6+pojPQ6HQ97e3rLZbJowYYKmTZumVq1aqVWrVpo2bZrq16+vYcOGmbUjR47UpEmT1LBhQwUEBCghIUFt27Y176Zr06aN+vbtq/j4eC1atEiSNGrUKMXGxio8PFySFB0drYiICMXFxWnWrFk6ffq0EhISFB8fzwgSAABwb2hauHChJKl79+4u85ctW6YRI0ZI+uHC85KSEo0ZM0YFBQXq1KmT0tLS5Ovra9bPmzdPnp6eGjp0qEpKStSrVy8lJSXJw8PDrFm5cqXGjx9v3mU3cOBALViwwFzu4eGhlJQUjRkzRl27dpW3t7eGDRum2bNn19C7BwAA1xObYRiGu5u4URQVFcnhcMjpdFb76NStz6dU6/Zw/Tkyo7+7WwCAG5LVv9/XdE3T8uXL1bVrV4WGhpq3/s+fP1/vvffetXULAABQy1U5NC1cuFATJ07U/fffr8LCQvOrSn7xi19o/vz51d0fAABArVDl0PTXv/5Vixcv1osvvuhyzVDHjh21b9++am0OAACgtqhyaDp8+LB++ctfVppvt9tdvmgXAADgRlLl0NS8eXNlZWVVmr9x40ZFRERUR08AAAC1TpUfOfCHP/xBTz31lP73v//JMAzt2rVL77zzjqZPn64333yzJnoEAABwuyqHpt/+9rc6d+6cnn32WX3//fcaNmyYbrnlFr3++ut69NFHa6JHAAAAt7umh1vGx8crPj5e3333nc6fP6+goKDq7gsAAKBWqfI1TSUlJfr+++8lSYGBgSopKdH8+fOVlpZW7c0BAADUFlUOTYMGDdLbb78tSSosLNSvfvUrzZkzR4MGDTK/FgUAAOBGU+XQ9J///Ef33nuvJOndd99VSEiIvv32W7399tv6y1/+Uu0NAgAA1AZVDk3ff/+9+WW5aWlpGjJkiOrUqaPOnTubX6kCAABwo6lyaGrZsqXWr1+vnJwcffjhh4qOjpYk5efnV/uX1AIAANQWVQ5NL7/8shISEnTrrbeqU6dOioqKkvTDqNPlnhQOAABwI6jyIwceeugh3XPPPcrNzVX79u3N+b169dLgwYOrtTkAAIDa4pqe0xQSEqKQkBCXeb/61a+qpSEAAIDaqMqhqUePHrLZbFdcvmXLlp/UEIDa6dbnU9zdAtzsyIz+7m4BcKsqh6Y777zT5XV5ebmysrKUnZ2t4cOHV1dfAAAAtUqVQ9O8efMuOz8xMVFnz579yQ0BAADURlW+e+5Kfv3rX2vp0qXVtTkAAIBapdpCU3p6uurVq1ddmwMAAKhVqnx6bsiQIS6vDcNQbm6u9uzZo5deeqnaGgMAAKhNqhyaHA6Hy+s6deooPDxcU6dONZ8ODgAAcKOpcmhatmxZTfQBAABQq1XbNU0AAAA3siqPNFVUVGjevHn65z//qaNHj6qsrMxl+enTp6utOQAAgNqiyiNNr7zyiubOnauhQ4fK6XRq4sSJGjJkiOrUqaPExMQaaBEAAMD9qhyaVq5cqcWLFyshIUGenp567LHH9Oabb+rll19WRkZGTfQIAADgdlUOTXl5eWrbtq0kqUGDBnI6nZKk2NhYpaTw3VQAAODGVOXQ1KRJE+Xm5kqSWrZsqbS0NEnS7t27Zbfbq7c7AACAWqLKoWnw4MH66KOPJElPP/20XnrpJbVq1Uq/+c1v9MQTT1R7gwAAALVBle+emzFjhvnzQw89pCZNmmjHjh1q2bKlBg4cWK3NAQAA1BZVDk2X6ty5szp37lwdvQAAANRalkLThg0bLG+Q0SYAAHAjshSaHnjgAUsbs9lsqqio+Cn9AAAA1EqWQtP58+drug8AAIBaje+eAwAAsMByaNqyZYsiIiJUVFRUaZnT6dQdd9yhf//731Xa+b///W8NGDBAoaGhstlsWr9+vcvyESNGyGazuUyXXnReWlqqcePGKTAwUD4+Pho4cKCOHTvmUlNQUKC4uDg5HA45HA7FxcWpsLDQpebo0aMaMGCAfHx
8FBgYqPHjx1f6Xj0AAHDzshya5s+fr/j4ePn5+VVa5nA4NHr0aM2bN69KOy8uLlb79u21YMGCK9b07dtXubm55vTBBx+4LJ8wYYLWrVun5ORkbd++XWfPnlVsbKzLtVXDhg1TVlaWUlNTlZqaqqysLMXFxZnLKyoq1L9/fxUXF2v79u1KTk7WmjVrNGnSpCq9HwAAcOOy/MiBzz//XK+99toVl0dHR2v27NlV2nm/fv3Ur1+/q9bY7XaFhIRcdpnT6dSSJUu0fPly9e7dW5K0YsUKhYWFafPmzYqJidGBAweUmpqqjIwMderUSZK0ePFiRUVF6eDBgwoPD1daWpr279+vnJwchYaGSpLmzJmjESNG6NVXX71sUAQAADcXyyNNJ06cUN26da+43NPTUydPnqyWpi62detWBQUFqXXr1oqPj1d+fr65LDMzU+Xl5YqOjjbnhYaGKjIyUjt27JAkpaeny+FwmIFJ+uHZUg6Hw6UmMjLSDEySFBMTo9LSUmVmZl6xt9LSUhUVFblMAADgxmQ5NN1yyy3at2/fFZfv3btXjRs3rpamLujXr59WrlypLVu2aM6cOdq9e7d69uyp0tJSST98ebCXl5f8/f1d1gsODlZeXp5ZExQUVGnbQUFBLjXBwcEuy/39/eXl5WXWXM706dPN66QcDofCwsJ+0vsFAAC1l+XQdP/99+vll1/W//73v0rLSkpKNGXKFMXGxlZrc4888oj69++vyMhIDRgwQBs3btRXX32llJSUq65nGIZsNpv5+uKff0rNpSZPniyn02lOOTk5Vt4WAAC4Dlm+pumPf/yj1q5dq9atW2vs2LEKDw+XzWbTgQMH9Le//U0VFRV68cUXa7JXNW7cWM2aNdOhQ4ckSSEhISorK1NBQYHLaFN+fr66dOli1pw4caLStk6ePGmOLoWEhGjnzp0uywsKClReXl5pBOpidrtddrv9J78vAABQ+1keaQoODtaOHTsUGRmpyZMna/DgwXrggQf0wgsvKDIyUp9++ulVA0Z1OHXqlHJycszTgB06dFDdunW1adMmsyY3N1fZ2dlmaIqKipLT6dSuXbvMmp07d8rpdLrUZGdnKzc316xJS0uT3W5Xhw4davQ9AQCA60OVvrC3WbNm+uCDD1RQUKD//ve/MgxDrVq1qnRNkVVnz57Vf//7X/P14cOHlZWVpYCAAAUEBCgxMVEPPvigGjdurCNHjuiFF15QYGCgBg8eLOmHRx2MHDlSkyZNUsOGDRUQEKCEhAS1bdvWvJuuTZs26tu3r+Lj47Vo0SJJ0qhRoxQbG6vw8HBJP9z5FxERobi4OM2aNUunT59WQkLCFR+xAAAAbj5VCk0X+Pv76+677/7JO9+zZ4969Ohhvp44caIkafjw4Vq4cKH27dunt99+W4WFhWrcuLF69Oih1atXy9fX11xn3rx58vT01NChQ1VSUqJevXopKSlJHh4eZs3KlSs1fvx48y67gQMHujwbysPDQykpKRozZoy6du0qb29vDRs2rMqPUAAAADcum2EYhrubuFEUFRXJ4XDI6XRW+wjVrc9f/eJ33PiOzOjv1v3zGYS7P4NATbH695vvngMAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAAC9wamv79739rwIABCg0Nlc1m0/r1612WG4ahxMREhYaGytvbW927d9cXX3zhUlNaWqpx48YpMDBQPj4+GjhwoI4dO+ZSU1BQoLi4ODk
cDjkcDsXFxamwsNCl5ujRoxowYIB8fHwUGBio8ePHq6ysrCbeNgAAuA65NTQVFxerffv2WrBgwWWXz5w5U3PnztWCBQu0e/duhYSEqE+fPjpz5oxZM2HCBK1bt07Jycnavn27zp49q9jYWFVUVJg1w4YNU1ZWllJTU5WamqqsrCzFxcWZyysqKtS/f38VFxdr+/btSk5O1po1azRp0qSae/MAAOC64unOnffr10/9+vW77DLDMDR//ny9+OKLGjJkiCTprbfeUnBwsFatWqXRo0fL6XRqyZIlWr58uXr37i1JWrFihcLCwrR582bFxMTowIEDSk1NVUZGhjp16iRJWrx4saKionTw4EGFh4crLS1N+/fvV05OjkJDQyVJc+bM0YgRI/Tqq6/Kz8/vZzgaAACgNqu11zQdPnxYeXl5io6ONufZ7XZ169ZNO3bskCRlZmaqvLzcpSY0NFSRkZFmTXp6uhwOhxmYJKlz585yOBwuNZGRkWZgkqSYmBiVlpYqMzPzij2WlpaqqKjIZQIAADemWhua8vLyJEnBwcEu84ODg81leXl58vLykr+//1VrgoKCKm0/KCjIpebS/fj7+8vLy8usuZzp06eb10k5HA6FhYVV8V0CAIDrRa0NTRfYbDaX14ZhVJp3qUtrLld/LTWXmjx5spxOpznl5ORctS8AAHD9qrWhKSQkRJIqjfTk5+ebo0IhISEqKytTQUHBVWtOnDhRafsnT550qbl0PwUFBSovL680AnUxu90uPz8/lwkAANyYam1oat68uUJCQrRp0yZzXllZmbZt26YuXbpIkjp06KC6deu61OTm5io7O9usiYqKktPp1K5du8yanTt3yul0utRkZ2crNzfXrElLS5PdbleHDh1q9H0CAIDrg1vvnjt79qz++9//mq8PHz6srKwsBQQEqGnTppowYYKmTZumVq1aqVWrVpo2bZrq16+vYcOGSZIcDodGjhypSZMmqWHDhgoICFBCQoLatm1r3k3Xpk0b9e3bV/Hx8Vq0aJEkadSoUYqNjVV4eLgkKTo6WhEREYqLi9OsWbN0+vRpJSQkKD4+ntEjAAAgyc2hac+ePerRo4f5euLEiZKk4cOHKykpSc8++6xKSko0ZswYFRQUqFOnTkpLS5Ovr6+5zrx58+Tp6amhQ4eqpKREvXr1UlJSkjw8PMyalStXavz48eZddgMHDnR5NpSHh4dSUlI0ZswYde3aVd7e3ho2bJhmz55d04cAAABcJ2yGYRjubuJGUVRUJIfDIafTWe0jVLc+n1Kt28P158iM/m7dP59BuPszCNQUq3+/a+01TQAAALUJoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFtTq0JSYmCibzeYyhYSEmMsNw1BiYqJCQ0Pl7e2t7t2764svvnDZRmlpqcaNG6fAwED5+Pho4MCBOnbsmEtNQUGB4uLi5HA45HA4FBcXp8LCwp/jLQIAgOtErQ5NknTHHXcoNzfXnPbt22cumzlzpubOnasFCxZo9+7dCgkJUZ8+fXTmzBmzZsKECVq3bp2Sk5O1fft2nT17VrGxsaqoqDBrhg0bpqysLKWmpio1NVVZWVmKi4v7Wd8nAACo3Tzd3cCP8fT0dBldusAwDM2fP18vvviihgwZIkl66623FBwcrFWrVmn06NFyOp1asmSJli9frt69e0uSVqxYobCwMG3evFkxMTE6cOCAUlNTlZGRoU6dOkmSFi9erKi
oKB08eFDh4eE/35sFAAC1Vq0faTp06JBCQ0PVvHlzPfroo/rmm28kSYcPH1ZeXp6io6PNWrvdrm7dumnHjh2SpMzMTJWXl7vUhIaGKjIy0qxJT0+Xw+EwA5Mkde7cWQ6Hw6y5ktLSUhUVFblMAADgxlSrQ1OnTp309ttv68MPP9TixYuVl5enLl266NSpU8rLy5MkBQcHu6wTHBxsLsvLy5OXl5f8/f2vWhMUFFRp30FBQWbNlUyfPt28DsrhcCgsLOya3ysAAKjdanVo6tevnx588EG1bdtWvXv3VkpKiqQfTsNdYLPZXNYxDKPSvEtdWnO5eivbmTx5spxOpznl5OT86HsCAADXp1odmi7l4+Ojtm3b6tChQ+Z1TpeOBuXn55ujTyEhISorK1NBQcFVa06cOFFpXydPnqw0inUpu90uPz8/lwkAANyYrqvQVFpaqgMHDqhx48Zq3ry5QkJCtGnTJnN5WVmZtm3bpi5dukiSOnTooLp167rU5ObmKjs726yJioqS0+nUrl27zJqdO3fK6XSaNQAAALX67rmEhAQNGDBATZs2VX5+vv785z+rqKhIw4cPl81m04QJEzRt2jS1atVKrVq10rRp01S/fn0NGzZMkuRwODRy5EhNmjRJDRs2VEBAgBISEszTfZLUpk0b9e3bV/Hx8Vq0aJEkadSoUYqNjeXOOQAAYKrVoenYsWN67LHH9N1336lRo0bq3LmzMjIy1KxZM0nSs88+q5KSEo0ZM0YFBQXq1KmT0tLS5Ovra25j3rx58vT01NChQ1VSUqJevXopKSlJHh4eZs3KlSs1fvx48y67gQMHasGCBT/vmwUAALWazTAMw91N3CiKiorkcDjkdDqr/fqmW59Pqdbt4fpzZEZ/t+6fzyDc/RkEaorVv9/X1TVNAAAA7kJoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJPdzcAAIAVtz6f4u4W4GZHZvR36/4ZaQIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDRd4u9//7uaN2+uevXqqUOHDvrkk0/c3RIAAKgFCE0XWb16tSZMmKAXX3xRn332me69917169dPR48edXdrAADAzQhNF5k7d65Gjhyp3/3ud2rTpo3mz5+vsLAwLVy40N2tAQAANyM0/Z+ysjJlZmYqOjraZX50dLR27Njhpq4AAEBtwXfP/Z/vvvtOFRUVCg4OdpkfHBysvLy8y65TWlqq0tJS87XT6ZQkFRUVVXt/50u/r/Zt4vpSE5+rquAzCD6DcLea+gxe2K5hGFetIzRdwmazubw2DKPSvAumT5+uV155pdL8sLCwGukNNzfHfHd3gJsdn0G4W01/Bs+cOSOHw3HF5YSm/xMYGCgPD49Ko0r5+fmVRp8umDx5siZOnGi+Pn/+vE6fPq2GDRteMWjh2hQVFSksLEw5OTny8/Nzdzu4CfEZhLvxGaw5hmHozJkzCg0NvWodoen/eHl5qUOHDtq0aZMGDx5szt+0aZMGDRp02XXsdrvsdrvLvF/84hc12eZNz8/Pj38s4FZ8BuFufAZrxtVGmC4gNF1k4sSJiouLU8eOHRUVFaV//OMfOnr0qH7/+9+7uzUAAOBmhKaLPPLIIzp16pSmTp2q3NxcRUZG6oMPPlCzZs3c3RoAAHAzQtMlxowZozFjxri7DVzCbrdrypQplU6HAj8XPoNwNz6D7mczfuz+OgAAAPBwSwAAACsITQAAABYQmgAAACwgNAEAAFhAaEKtNGDAAPXu3fuyy9LT02Wz2fSf//znZ+4KN6P
8/HyNHj1aTZs2ld1uV0hIiGJiYpSenu7u1nCDq6ioUJcuXfTggw+6zHc6nQoLC9Mf//hHN3V28yI0oVYaOXKktmzZom+//bbSsqVLl+rOO+/UXXfd5YbOcLN58MEH9fnnn+utt97SV199pQ0bNqh79+46ffq0u1vDDc7Dw0NvvfWWUlNTtXLlSnP+uHHjFBAQoJdfftmN3d2ceOQAaqVz586pSZMmevLJJzVlyhRz/vfff6+QkBBNmzZNY8eOdWOHuBkUFhbK399fW7duVbdu3dzdDm5Sf/nLX5SYmKjs7Gzt3r1bDz/8sHbt2qU777zT3a3ddBhpQq3k6emp3/zmN0pKStLFuf5f//qXysrK9Pjjj7uxO9wsGjRooAYNGmj9+vUqLS11dzu4SY0bN07t27fXb37zG40aNUovv/wygclNGGlCrfXll1+qTZs22rJli3r06CFJ6tatm2655RatWrXKzd3hZrFmzRrFx8erpKREd911l7p166ZHH31U7dq1c3druIlc+Pewbdu2+s9//iNPT77Qwx0YaUKtdfvtt6tLly5aunSpJOnrr7/WJ598oieeeMLNneFm8uCDD+r48ePasGGDYmJitHXrVt11111KSkpyd2u4iSxdulT169fX4cOHdezYMXe3c9NipAm12tKlSzV27Fjl5eVp5syZWrlypb755hvZbDZ3t4ab2O9+9ztt2rTpsjcqANUtPT1d9913nzZu3KiZM2eqoqJCmzdv5t9BN2CkCbXa0KFD5eHhoVWrVumtt97Sb3/7W/6hgNtFRESouLjY3W3gJlBSUqLhw4dr9OjR6t27t958803t3r1bixYtcndrNyVGmlDr/e53v9PatWvldDp1+PBhNW3a1N0t4SZx6tQpPfzww3riiSfUrl07+fr6as+ePRo3bpz69++vJUuWuLtF3OCefvpppaSk6PPPP5ePj48kafHixZo4caL27dunW2+91b0N3mQITaj10tPT1aVLF0VHR+vDDz90dzu4iZSWlioxMVFpaWn6+uuvVV5errCwMD388MN64YUX5O3t7e4WcQPbtm2bevXqpa1bt+qee+5xWRYTE6Nz585xmu5nRmgCAACwgGuaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQBQRd27d9eECRMs12/dulU2m02FhYU/ab+33nqr5s+f/5O2AeDaEZoA1HojRoyQzWaTzWaTp6enmjZtqieffFIFBQXVvp8HHnigWrcJ4MZBaAJwXejbt69yc3N15MgRvfnmm3r//fc1ZswYd7cF4CZCaAJwXbDb7QoJCVGTJk0UHR2tRx55RGlpaeby8+fPa+rUqWrSpInsdrvuvPNOpaamumxj37596tmzp7y9vdWwYUONGjVKZ8+elSQlJibqrbfe0nvvvWeOam3dutVSbytWrFDHjh3l6+urkJAQDRs2TPn5+ZXqPv30U7Vv31716tVTp06dtG/fPpflO3bs0H333Sdvb2+FhYVp/PjxKi4uruKRAlBTCE0ArjvffPONUlNTVbduXXPe66+/rjlz5mj27Nnau3evYmJiNHDgQB06dEiS9P3336tv377y9/fX7t279a9//UubN2/W2LFjJUkJCQkaOnSoOaKVm5urLl26WOqnrKxMf/rTn/T5559r/fr1Onz4sEaMGFGp7g9/+INmz56t3bt3KygoSAMHDlR5ebmkHwJdTEyMhgwZor1792r16tXavn272R+AWsAAgFpu+PDhhoeHh+Hj42PUq1fPkGRIMubOnWvWhIaGGq+++qrLenfffbcxZswYwzAM4x//+Ifh7+9vnD171lyekpJi1KlTx8jLyzP3M2jQoB/tp1u3bsbTTz99xeW7du0yJBlnzpwxDMMwPv74Y0OSkZycbNacOnXK8Pb2NlavXm0YhmHExcUZo0aNctnOJ598YtSpU8coKSkxDMMwmjVrZsybN+9H+wNQMxhpAnBd6NG
jh7KysrRz506NGzdOMTExGjdunCSpqKhIx48fV9euXV3W6dq1qw4cOCBJOnDggNq3by8fHx+X5efPn9fBgwd/Um+fffaZBg0apGbNmsnX11fdu3eXJB09etSlLioqyvw5ICBA4eHhZn+ZmZlKSkpSgwYNzCkmJkbnz5/X4cOHf1J/AKoHoQnAdcHHx0ctW7ZUu3bt9Je//EWlpaV65ZVXXGpsNpvLa8MwzHkX/3ypK823ori4WNHR0WrQoIFWrFih3bt3a926dZJ+OG33Yy7s+/z58xo9erSysrLM6fPPP9ehQ4fUokWLa+4PQPUhNAG4Lk2ZMkWzZ8/W8ePH5efnp9DQUG3fvt2lZseOHWrTpo0kKSIiQllZWS4XVn/66aeqU6eOWrduLUny8vJSRUVFlfr48ssv9d1332nGjBm69957dfvtt1/2InBJysjIMH8uKCjQV199pdtvv12SdNddd+mLL75Qy5YtK01eXl5V6glAzSA0Abgude/eXXfccYemTZsm6YeLrF977TWtXr1aBw8e1PPPP6+srCw9/fTTkqTHH39c9erV0/Dhw5Wdna2PP/5Y48aNU1xcnIKDgyX98PDIvXv36uDBg/ruu+/Mi7SvpmnTpvLy8tJf//pXffPNN9qwYYP+9Kc/XbZ26tSp+uijj5Sdna0RI0YoMDDQfC7Uc889p/T0dD311FPKysrSoUOHtGHDBvMUJAD3IzQBuG5NnDhRixcvVk5OjsaPH69JkyZp0qRJatu2rVJTU7Vhwwa1atVKklS/fn19+OGHOn36tO6++2499NBD6tWrlxYsWGBuLz4+XuHh4erYsaMaNWqkTz/99Ed7aNSokZKSkvSvf/1LERERmjFjhmbPnn3Z2hkzZujpp59Whw4dlJubqw0bNpijSO3atdO2bdt06NAh3XvvvfrlL3+pl156SY0bN66GIwWgOtgMwzDc3QQAAEBtx0gTAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACz4/wAB2sUKYGbL1QAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "root_counts = dict()\n", + "for sketch, count in sketch_counter.items():\n", + " root_counts[sketch[1]] = root_counts.get(sketch[1], 0) + count\n", + "\n", + "plt.bar(range(len(root_counts)), list(root_counts.values()), align='center')\n", + "plt.xticks(range(len(root_counts)), list(root_counts.keys()))\n", + "plt.title('Root label counts in clauses')\n", + "plt.xlabel('Root label')\n", + "plt.ylabel('Clause count')\n", + "plt.show()\n", + "\n", + "root_counts = dict()\n", + "for sketch, count in sketch_counter.items():\n", + " root_counts[sketch[1]] = root_counts.get(sketch[1], 0) + 1\n", + " \n", + "plt.bar(range(len(root_counts)), list(root_counts.values()), align='center')\n", + "plt.xticks(range(len(root_counts)), list(root_counts.keys()))\n", + "plt.title('Root label counts in sketches')\n", + "plt.xlabel('Root label')\n", + "plt.ylabel('Sketch count')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1f373fb2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "subtree_counts = dict()\n", + "for sketch, count in sketch_counter.items():\n", + " subtree_count = sketch.count(')')\n", + " subtree_counts[subtree_count] = subtree_counts.get(subtree_count, 0) + count\n", + "subtree_counts = dict(sorted(subtree_counts.items())) \n", + "\n", + "plt.bar(range(len(subtree_counts)), list(subtree_counts.values()), align='center')\n", + "plt.xticks(range(len(subtree_counts)), list(subtree_counts.keys()))\n", + "plt.title('Subtree counts in clauses')\n", + "plt.xlabel('Subtree size')\n", + "plt.ylabel('Clause count')\n", + "plt.show() \n", + "\n", + "subtree_counts = dict()\n", + "for sketch, count in sketch_counter.items():\n", + " subtree_count = sketch.count(')')\n", + " subtree_counts[subtree_count] = subtree_counts.get(subtree_count, 0) + 1\n", + "subtree_counts = dict(sorted(subtree_counts.items())) \n", + "\n", + "\n", + "plt.bar(range(len(subtree_counts)), list(subtree_counts.values()), align='center')\n", + "plt.xticks(range(len(subtree_counts)), list(subtree_counts.keys()))\n", + "plt.title('Subtree counts in sketches')\n", + "plt.xlabel('Subtree size')\n", + "plt.ylabel('Sketch count')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9db8626a", + "metadata": {}, + "source": [ + "## III. 
Save top 50 sketches for knock-out experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f8a44838", + "metadata": {}, + "outputs": [], + "source": [ + "DataFrame({\n", + " 'sketch': [sketch for sketch, _ in sketch_counter.most_common(50)],\n", + " 'support': [count for _, count in sketch_counter.most_common(50)] \n", + "}).to_csv( knockout_list_file )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/01d_prepare_sketches.py b/01d_prepare_sketches.py new file mode 100644 index 00000000..ba659a84 --- /dev/null +++ b/01d_prepare_sketches.py @@ -0,0 +1,632 @@ +# +# Creates frequency table of syntax sketches, and prepares datasets +# for sketches knockout experiments: removes clauses corresponding +# to sketches systematically from train, dev and test sets. 
#

import os
import sys
import configparser
from datetime import datetime
from collections import Counter

from typing import List, Tuple, Dict, Any

import os.path
from random import Random

from pandas import DataFrame
from pandas import read_csv

from syntax_sketches.syntax_sketch import safe_sketch_name
from syntax_sketches.syntax_sketch import extract_sketches
from syntax_sketches.syntax_sketch import remove_sketches
from syntax_sketches.syntax_sketch import remove_sketches_group
from syntax_sketches.syntax_sketch import rand_group_sketches
from syntax_sketches.syntax_sketch import compute_sketches
from syntax_sketches.clause_import import import_clauses
from syntax_sketches.clause_export import remove_extracted_from_conllu_and_dicts


def _require_option(config, conf_file, section, option):
    '''
    Returns the value of `option` in the given configuration `section`.
    Raises ValueError if the option is missing.
    '''
    if not config.has_option(section, option):
        raise ValueError(f'Error in {conf_file}: section {section!r} is missing "{option}" parameter.')
    return config[section][option]


def _require_input_dir(config, conf_file, section):
    '''
    Returns the value of the mandatory "input_dir" option of the `section`.
    Raises ValueError if the option is missing and FileNotFoundError if
    the value is not an existing directory.
    '''
    input_dir = _require_option(config, conf_file, section, 'input_dir')
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "input_dir" value {input_dir!r} in {section!r}.')
    return input_dir


def _parse_skip_list(section_conf):
    '''
    Parses the optional comma-separated "skip_list" option (default:
    'train_full.conllu') into a list of stripped file names.
    An empty value gives an empty list (not an empty string).
    '''
    raw_value = section_conf.get('skip_list', 'train_full.conllu')
    if not raw_value:
        return []
    return [fname.strip() for fname in raw_value.split(',')]


def prepare_sketches_main( conf_file ):
    '''
    Creates frequency table of syntax sketches for a corpus, and/or
    prepares datasets for sketches knockout experiments: removes
    clauses corresponding to sketches systematically from train,
    dev and test sets.
    Inputs/outputs and parameters of the processing will be read
    from the given `conf_file`.
    Executes sections in the configuration starting with prefix
    'make_sketches_table_' and 'prepare_knockout_'.

    Raises FileNotFoundError if `conf_file`, a section's "input_dir" or
    "top_sketches_file" does not exist, and ValueError if the file is
    not readable as INI or a mandatory option is missing.
    '''
    # Parse configuration file
    config = configparser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise FileNotFoundError("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))
    section_found = False
    start = datetime.now()
    for section in config.sections():
        if section.startswith('make_sketches_table_'):
            section_found = True
            print(f'Performing {section} ...')
            # Collect sketch computing parameters
            input_dir = _require_input_dir(config, conf_file, section)
            output_csv_file = _require_option(config, conf_file, section, 'output_csv_file')
            top_n = config[section].getint('top_n', 50)
            skip_list = _parse_skip_list(config[section])
            # Compute sketches frequency TOP N and save into CSV file
            compute_sketches_freq_table( input_dir, top_n, output_csv_file, skip_files=skip_list, verbose=True )
        elif section.startswith('prepare_knockout_'):
            section_found = True
            print(f'Performing {section} ...')
            # Collect knockout preparation parameters
            # input_dir -- contains train/dev/test conllu files with clauses
            input_dir = _require_input_dir(config, conf_file, section)
            # top_sketches_file -- CSV file containing top_n sketches (created via 'make_sketches_table_')
            top_sketches_file = _require_option(config, conf_file, section, 'top_sketches_file')
            if not os.path.isfile(top_sketches_file):
                raise FileNotFoundError(f'Error in {conf_file}: invalid or missing "top_sketches_file" value '+\
                                        f'{top_sketches_file!r} in {section!r}.')
            # min_support -- the minimum number of clauses from top n sketches in test panel.
            # If test set does not contain at least min_support clauses of a sketch, then missing
            # number of clauses will be extracted and removed from the train set.
            min_support = config[section].getint('min_support', 50)
            # random_groups -- distributes top_n_sketches randomly into random_groups bins,
            # and prepares knockout splits on groups instead of on all sketches.
            # If None (default), then prepares knockout splits on all top_n_sketches separately.
            random_groups = config[section].getint('random_groups', None)
            grouping_seed = config[section].getint('grouping_seed', 1)
            top_sketches_grouped_file = config[section].get('top_sketches_grouped_file', None)
            # create_control -- instead of preparing knockout data on top_n_sketches or corresponding groups,
            # pick same amounts of clauses randomly for control experiment.
            create_control = config[section].getboolean('create_control', False)
            create_control_seed = config[section].getint('create_control_seed', 5)
            # initial_output_dir -- for saving test panel and pure train data
            initial_output_dir = _require_option(config, conf_file, section, 'initial_output_dir')
            initial_train_fname = config[section].get('initial_train_fname', 'train_pure.conllu')
            initial_test_fname = config[section].get('initial_test_fname', None)
            # final_output_dir -- for saving knockout splits data
            final_output_dir = _require_option(config, conf_file, section, 'final_output_dir')
            skip_list = _parse_skip_list(config[section])
            # Load input data
            whole_data_map = load_clauses_datasets(input_dir, skip_files=skip_list)
            top_n_sketches = read_csv(top_sketches_file, index_col=0).values.tolist()
            # Create output directories and required file names
            os.makedirs(initial_output_dir, exist_ok=True)
            os.makedirs(final_output_dir, exist_ok=True)
            if initial_test_fname is None:
                initial_test_fname = f'test_{len(top_n_sketches)}x{min_support}.conllu'
            initial_test_fname = os.path.join(initial_output_dir, initial_test_fname)
            initial_train_fname = os.path.join(initial_output_dir, initial_train_fname)
            grouped_sketches = None
            if random_groups is not None:
                # Grouping is done (and its CSV written) before the test panel
                # is created, same as in the separate-sketches flow below.
                if top_sketches_grouped_file is None:
                    top_sketches_grouped_file = f'top_{len(top_n_sketches)}_sketches_{random_groups}_groups.csv'
                    # Put top_sketches_grouped_file into same folder as top_sketches_file
                    head, _tail = os.path.split( top_sketches_file )
                    if len(head) > 0:
                        top_sketches_grouped_file = os.path.join( head, top_sketches_grouped_file )
                # Group sketches
                grouped_sketches = group_top_sketches_randomly(top_n_sketches, random_groups,
                                                               top_sketches_grouped_file,
                                                               seed=grouping_seed)
            # Create test panel and pure train (needed by both knockout flavours)
            test_data, test_data_sketches = \
                create_test_panel_and_pure_train(top_n_sketches, whole_data_map, initial_test_fname,
                                                 initial_train_fname, min_support=min_support)
            if grouped_sketches is not None:
                # ==================================
                #   Knockout grouped top sketches
                # ==================================
                # With create_control, the knockout step is a dry run that only
                # collects how many clauses would have been removed.
                control_removed = create_knockout_files_from_grouped_sketches(grouped_sketches, test_data_sketches,
                                                                              test_data, whole_data_map, final_output_dir,
                                                                              dry_run=create_control)
                if create_control:
                    # Create randomized control data
                    create_random_control_files_from_grouped_sketches(grouped_sketches, test_data_sketches, test_data,
                                                                      whole_data_map, control_removed, final_output_dir,
                                                                      seed=create_control_seed)
            else:
                # ==================================
                #   Knockout all top sketches
                # ==================================
                control_removed = create_knockout_files_from_sketches(test_data_sketches, test_data, whole_data_map,
                                                                      final_output_dir, dry_run=create_control)
                if create_control:
                    # Create randomized control data
                    create_random_control_files_from_sketches(test_data_sketches, test_data, whole_data_map,
                                                              control_removed, final_output_dir,
                                                              seed=create_control_seed)
    if section_found:
        print(f'Total processing time: {datetime.now()-start}')
    else:
        print(f'No section starting with "make_sketches_table_" or "prepare_knockout_" in {conf_file}.')

#
# =============================================
#   Make sketches frequency table
# =============================================

def compute_sketches_freq_table(input_dir:str, N:int, output_file:str, skip_files:List[str]=None, verbose:bool=True):
    '''
    Computes syntax sketches for conllu files in the input_dir, extracts top N most frequent sketches
    and saves into CSV file output_file (must end with '.csv').
    Assumes that all conllu files in the input_dir have been created via script "01b_extract_clauses.py",
    that is, they contain clauses instead of sentences.
    Optionally, you can skip some of the conllu input files from computations via parameter skip_files
    (None, the default, means that no files are skipped).
    Returns DataFrame with most frequent sketches and their frequencies (columns 'sketch' and 'support').
    '''
    assert output_file.endswith('.csv'), f'(!) Unexpected file ending for a csv file {output_file!r}'
    if skip_files is None:
        skip_files = []
    sketches, clauses_count_total = compute_sketches( input_dir, skip_files=skip_files, verbose=verbose )
    # Rank once and reuse the ranking for both columns
    top_sketches = Counter(sketches).most_common(N)
    df = DataFrame({
        'sketch':  [sketch for sketch, _ in top_sketches],
        'support': [count for _, count in top_sketches]
    })
    if verbose:
        print(f'Writing sketches freq table into {output_file}.')
    df.to_csv( output_file )
    return df

# =============================================
#   Prepare sketches data for knockout exp
# =============================================

def load_clauses_datasets(input_dir:str, skip_files:List[str]=None, verbose:bool=True):
    '''
    Loads train, dev and test datasets from conllu files in the input_dir.
    The input_dir must contain exactly one file of each type; the type is
    detected from the file name (contains 'train', 'dev' or 'test', checked
    in that order). Files listed in skip_files are ignored.
    Assumes that all conllu files in the input_dir have been created via
    script "01b_extract_clauses.py", that is, they contain clauses instead
    of sentences.
    Returns dictionary mapping keys {'train', 'dev', 'test'} to lists
    [file_name, list_of_clause_conllu_strings, list_of_clause_dicts].
    Raises AssertionError if some dataset is missing or duplicated.
    '''
    if skip_files is None:
        skip_files = []
    # Load data from conllu files
    whole_data = []
    for fname in sorted(os.listdir(input_dir)):
        if fname in skip_files or not fname.endswith('.conllu'):
            continue
        fpath = os.path.join(input_dir, fname)
        # import clauses as conllu strings
        clauses = import_clauses( fpath, as_dicts=False )
        # import clauses as dicts (required for sketch creation)
        clause_dicts = import_clauses( fpath, as_dicts=True )
        # sanity check
        assert len(clauses) == len(clause_dicts), f'{len(clauses)} vs {len(clause_dicts)}'
        if verbose:
            print( fname, f'| #clauses: {len(clauses)}' )
        whole_data.append( (fname, clauses, clause_dicts) )
    # Map loaded train, test, dev sets to named shortcuts
    whole_data_map = {}
    for (fname, clauses, clause_dicts) in whole_data:
        for subset in ('train', 'dev', 'test'):
            if subset in fname:
                assert subset not in whole_data_map, \
                    f'More than 1 {subset} files: {fname} vs {whole_data_map[subset][0]}'
                whole_data_map[subset] = [fname, clauses, clause_dicts]
                break
    assert whole_data_map.keys() == {'train', 'dev', 'test'}, \
        f'(!) Expected train, dev and test files in {input_dir!r}, found only: {sorted(whole_data_map.keys())}'
    return whole_data_map

def write_conllu_file(out_fpath:str, clauses_conllus:List[str]):
    '''
    Saves clause conllu strings (clauses_conllus) into file out_fpath,
    separating clauses with an empty line.
    An existing file will be overwritten.
    '''
    assert out_fpath.endswith('.conllu'), \
        f'(!) Unexpected output file ending in {out_fpath}. Expected file ending with .conllu'
    with open(out_fpath, 'w', encoding='utf-8') as fout:
        fout.write('\n\n'.join( clauses_conllus ))
        # Add final empty line (to avoid UDError: The CoNLL-U file does not end with empty line)
        fout.write('\n\n')

def create_test_panel_and_pure_train(top_sketches_list:List[Any], whole_data_map:Dict[str,List[Any]],
                                     test_file:str, train_file:str, min_support:int=50,
                                     verbose:bool=True) -> Tuple[List[List[str]], List[str]]:
    '''
    Creates a panel test set, extracting all occurrences of top_sketches_list from test set.
    If test set does not contain at least min_support (default: 50) clauses of a sketch,
    then extracts and removes missing number of clauses from the train set.
    Saves the panel test set into test_file, and purified train set (that has no overlap
    with test panel) into train_file.
    Note: the train entry of whole_data_map is updated in place to the purified train set.
    Returns tuple (list_of_clause_conllu_lists, list_of_corresponding_sketch_names).
    '''
    test_data = []
    test_data_sketches = []
    for sketch, _support in sorted(top_sketches_list, key=lambda x:x[1], reverse=True):
        # Extract all sketches from the test set
        test_conllu = whole_data_map['test'][1]
        test_dicts  = whole_data_map['test'][2]
        extracted_conllu, extracted_dicts, extracted_amount = \
            extract_sketches(test_conllu, test_dicts, sketch)
        if verbose:
            print(sketch)
            print( f' extracted {extracted_amount} clauses from test' )
        test_data.append( extracted_conllu )
        test_data_sketches.append( sketch )
        if extracted_amount < min_support:
            # If the test set does not contain enough instances (at least min_support),
            # we take them from the training set. There are enough clauses in the training
            # set so this should not affect the overall performance.
            train_conllu = whole_data_map['train'][1]
            train_dicts  = whole_data_map['train'][2]
            extracted_train_conllu, extracted_train_dicts, extracted_train_amount = \
                extract_sketches(train_conllu, train_dicts, sketch, amount=min_support-extracted_amount)
            if verbose:
                print( f' extracted {extracted_train_amount} clauses from train' )
            # Extend the last sub list in new test
            test_data[-1].extend( extracted_train_conllu )
            # Remove extracted clauses from train
            new_train_conllu, new_train_dicts, deletion_counts = \
                remove_extracted_from_conllu_and_dicts(train_conllu, train_dicts, extracted_train_conllu)
            deleted_total = sum(deletion_counts.values())
            assert len(new_train_conllu) == len(train_conllu) - deleted_total
            assert len(new_train_dicts) == len(train_dicts) - deleted_total
            # Update train (make it pure from test)
            whole_data_map['train'][1] = new_train_conllu
            whole_data_map['train'][2] = new_train_dicts
    assert len(test_data) == len(test_data_sketches)
    # Write out test panel file
    flat_test_data = [clause for conll_list in test_data for clause in conll_list]
    write_conllu_file( test_file, flat_test_data )
    if verbose:
        print(f' Saved {len(flat_test_data)} test clauses into {test_file}.')
    # Write out purified train file (all clauses from test panel have been removed)
    write_conllu_file( train_file, whole_data_map['train'][1] )
    if verbose:
        print(f' Saved {len(whole_data_map["train"][1])} train clauses into {train_file}.')
    return test_data, test_data_sketches

# ==================================
#   Knockout grouped top sketches
# ==================================

def group_top_sketches_randomly(top_sketches_list:List[Any], N:int, output_csv_file:str, seed:int=1,
                                flatten_groups:bool=True):
    '''
    Distributes items of top_sketches_list randomly into N roughly equal size bins
    (via rand_group_sketches) and saves the grouping into output_csv_file.
    Assumes top_sketches_list is a list of lists, where each sub list contains two
    items: sketch name (str) and its frequency (int).
    If flatten_groups==True (default), then returns list of lists, where there are
    N sub lists and each sub list contains only sketch names.
    Otherwise, returns list of lists of lists, where there are N sub lists and each
    sub list contains two item lists (sketch name and support).
    '''
    grouped = rand_group_sketches( top_sketches_list, N, seed=seed )
    DataFrame({
        'grouped_sketches': [';'.join([s[0] for s in gr]) for gr in grouped],
        'support': [sum([s[1] for s in gr]) for gr in grouped]
    }).to_csv( output_csv_file )
    if flatten_groups:
        # Flatten list: remove sketch frequencies,
        # keep only sketch names
        for group in grouped:
            assert all([isinstance(sketch[0], str) for sketch in group])
        grouped = [[sketch[0] for sketch in group] for group in grouped]
    return grouped

def _collect_group_test_clauses(group:List[str], test_data_sketches:List[str],
                                test_data:List[List[str]], verbose:bool=True) -> List[str]:
    '''
    Collects test panel clauses of all sketches belonging to the group,
    keeping the order of test_data_sketches.
    '''
    group_members = set(group)
    group_clauses = []
    for sketch, test_conll in zip(test_data_sketches, test_data):
        if sketch in group_members:
            if verbose:
                print(sketch)
            group_clauses.extend(test_conll)
    return group_clauses

def _drop_random_clauses(rnd:Random, clauses:List[str], amount:int) -> List[str]:
    '''
    Returns a copy of clauses from which `amount` randomly picked items
    have been left out. Order of the remaining clauses is preserved.
    '''
    # Sample only valid indexes (0 .. len-1), so that exactly `amount` clauses get dropped
    dropped = set(rnd.sample(range(len(clauses)), amount))
    return [conllu for i, conllu in enumerate(clauses) if i not in dropped]

def create_knockout_files_from_grouped_sketches(grouped_sketches:List[Any], test_data_sketches:List[str],
                                                test_data:List[List[str]], whole_data_map:Dict[str,List[Any]],
                                                output_dir:str, dry_run:bool=False, verbose:bool=True) -> List[Tuple[int, int, int]]:
    '''
    Creates knockout train, dev and test files based on the grouped_sketches.
    For each sketch group, removes all clauses corresponding to sketches of the
    group from train and dev, and creates a subset of test set containing only
    clauses corresponding to sketches of the group.
    Saves resulting files into output_dir, under names 'test_group{GID}.conllu',
    'train_group{GID}.conllu', 'dev_group{GID}.conllu', where GID is the index
    of the sketch group.
    If dry_run==True, then only imitates saving sketches into files, but does
    not write any actual files. Use this option if you want to collect removal
    statistics for preparation of control experiments.
    Returns list of tuples:
    (sketch group index, no of clauses removed from train, no of clauses removed from dev)
    '''
    assert len(test_data_sketches) == len(test_data)
    control_removed = [] # (sketch group, no of clauses removed from train, no of clauses removed from dev)
    for gid, group in enumerate( grouped_sketches ):
        assert isinstance(group, list) and all(isinstance(s, str) for s in group)
        if verbose:
            print(f'group: {gid}')
        #
        # Subset for test
        #
        group_clauses = _collect_group_test_clauses(group, test_data_sketches, test_data, verbose=verbose)
        out_test_fname = f'test_group{gid}.conllu'
        if not dry_run:
            write_conllu_file( os.path.join(output_dir, out_test_fname), group_clauses )
        if verbose:
            print(f' Saved {len(group_clauses)} test clauses into {out_test_fname}.')
        #
        # Extract subsets of dev and train
        #
        # For train, assume that instances appearing in test set
        # have already been removed in previous steps
        # (we have a "pure train set")
        removed_amounts = {}
        for subset in ('dev', 'train'):
            all_conllu = whole_data_map[subset][1]
            all_dicts  = whole_data_map[subset][2]
            # Remove and return remaining clauses
            # Note: the removal is virtual, actual lists are not affected
            preserved_conllu, _preserved_dicts, removed_amounts[subset] = \
                remove_sketches_group(all_conllu, all_dicts, group)
            out_fname = f'{subset}_group{gid}.conllu'
            if not dry_run:
                write_conllu_file( os.path.join(output_dir, out_fname), preserved_conllu )
            if verbose:
                print(f' Removed {removed_amounts[subset]} clauses and saved remaining {len(preserved_conllu)} {subset} clauses into {out_fname}.')
        # Remember how much we removed (for control experiments)
        control_removed.append( (gid, removed_amounts['train'], removed_amounts['dev']) )
    return control_removed


def create_random_control_files_from_grouped_sketches(grouped_sketches:List[Any], test_data_sketches:List[str],
                                                      test_data:List[List[str]], whole_data_map:Dict[str,List[Any]],
                                                      control_removed:List[Tuple[int, int, int]], output_dir:str,
                                                      seed:int=5, verbose:bool=True) -> None:
    '''
    Creates random control train, dev and test files for grouped knockout experiments.
    For each sketch group, removes randomly the same amount of clauses that were removed
    in knockout dataset preparation (create_knockout_files_from_grouped_sketches) from
    train and dev files. Group's test set will be the same as in knockout dataset
    preparation.
    Saves resulting files into output_dir, under names 'test_group{GID}.conllu',
    'train_group{GID}.conllu', 'dev_group{GID}.conllu', where GID is the index
    of the sketch group.
    Clause lists in whole_data_map are not modified. Random picks are reproducible
    for a fixed seed. Raises ValueError (from random sampling) if more clauses
    should be removed than there are in train or dev.
    '''
    assert len(test_data_sketches) == len(test_data)
    rnd = Random()
    rnd.seed(seed)
    for (gid, removed_from_train, removed_from_dev) in control_removed:
        group = grouped_sketches[gid]
        assert isinstance(group, list) and all(isinstance(s, str) for s in group)
        if verbose:
            print(f'group: {gid}')
        #
        # Subset for test
        #
        group_clauses = _collect_group_test_clauses(group, test_data_sketches, test_data, verbose=verbose)
        out_test_fname = f'test_group{gid}.conllu'
        write_conllu_file( os.path.join(output_dir, out_test_fname), group_clauses )
        if verbose:
            print(f' Saved {len(group_clauses)} test clauses into {out_test_fname}.')
        #
        # Pick randomly same amount of clauses from train & dev
        # and remove the picked clause instances
        # (train is sampled before dev to keep the random sequence stable)
        #
        ablation_train = _drop_random_clauses(rnd, whole_data_map['train'][1], removed_from_train)
        ablation_dev   = _drop_random_clauses(rnd, whole_data_map['dev'][1], removed_from_dev)
        #
        # Save results
        #
        out_dev_fname = f'dev_group{gid}.conllu'
        write_conllu_file( os.path.join(output_dir, out_dev_fname), ablation_dev )
        if verbose:
            print(f' Removed {removed_from_dev} clauses randomly and saved remaining {len(ablation_dev)} dev clauses into {out_dev_fname}.')
        out_train_fname = f'train_group{gid}.conllu'
        write_conllu_file( os.path.join(output_dir, out_train_fname), ablation_train )
        if verbose:
            print(f' Removed {removed_from_train} clauses randomly and saved remaining {len(ablation_train)} train clauses into {out_train_fname}.')


# ==================================
#   Knockout all top
sketches +# ================================== + +def create_knockout_files_from_sketches(test_data_sketches:List[str], test_data:List[List[str]], + whole_data_map:Dict[str,List[Any]], output_dir:str, + dry_run:bool=False, verbose:bool=True) -> List[Tuple[int, int, int]]: + ''' + Creates knockout train, dev and test files based for all test_data_sketches. + For sketch in test_data_sketches, removes its clauses from train and dev, and + creates a subset of test set containing only clauses of this sketch. + Saves resulting files into output_dir, under names 'test_{SKETCH}.conllu', + 'train_{SKETCH}.conllu', 'dev_{SKETCH}.conllu', where SKETCH is the name + of the sketch (made safe via safe_sketch_name(...)). + If dry_run==True, then only imitates saving sketches into files, but does + not write any actual files. Use this option if you want to collect removal + statistics for preparation of control experiments. + Returns tuple: + (sketch name, no of clauses removed from train, no of clauses removed from dev) + ''' + assert len(test_data_sketches) == len(test_data) + control_removed = [] # (sketch name, no of clauses removed from train, no of clauses removed from dev) + for sketch, test_conll in zip(test_data_sketches, test_data): + sketch_id = safe_sketch_name(sketch) + if verbose: + print(sketch) + # + # Subset for test + # + out_test_fname = f'test_{sketch_id}.conllu' + if not dry_run: + write_conllu_file( os.path.join(output_dir, out_test_fname), test_conll ) + if verbose: + print(f' Saved {len(test_conll)} test clauses into {out_test_fname}.') + # + # Extract subset of dev + # + all_conllu = whole_data_map['dev'][1] + all_dicts = whole_data_map['dev'][2] + # Remove sketch and return remaining clauses + # Note: the removal is virtual, actual lists are not affected + preserved_conllu, preserved_dicts, removed_amount_dev = \ + remove_sketches(all_conllu, all_dicts, sketch) + out_dev_fname = f'dev_{sketch_id}.conllu' + if not dry_run: + write_conllu_file( 
def create_random_control_files_from_sketches(test_data_sketches:List[str], test_data:List[List[str]],
                                              whole_data_map:Dict[str,List[Any]], control_removed:List[Tuple[str, int, int]],
                                              output_dir:str, seed:int=5, verbose:bool=True) -> None:
    '''
    Creates random control train, dev and test files for single sketch knockout experiments.
    For each sketch in control_removed, removes randomly the same amount of clauses that were
    removed in knockout dataset preparation (create_knockout_files_from_sketches) from
    train and dev files. Sketch's test set will be the same as in knockout dataset
    preparation.
    Saves resulting files into output_dir, under names 'test_{SKETCH}.conllu',
    'train_{SKETCH}.conllu', 'dev_{SKETCH}.conllu', where SKETCH is the name
    of the sketch (made safe via safe_sketch_name(...)).

    Note: indexes are sampled from range(len(data)), so exactly the requested
    number of clauses is removed. (Sampling from range(len(data)+1) could pick
    a non-existent index and silently remove one clause less.)
    Does not return anything.
    '''
    assert len(test_data_sketches) == len(test_data)
    rnd = Random()
    rnd.seed(seed)
    for (sketch, removed_from_train, removed_from_dev) in control_removed:
        sketch_id = safe_sketch_name(sketch)
        if verbose:
            print(sketch)
        #
        # Subset for test
        #
        # Collect all test clauses of the sketch (first matching sketch only)
        sketch_clauses = []
        for sketch2, test_conll in zip(test_data_sketches, test_data):
            if sketch2 == sketch:
                sketch_clauses.extend(test_conll)
                break
        out_test_fname = f'test_{sketch_id}.conllu'
        write_conllu_file( os.path.join(output_dir, out_test_fname), sketch_clauses )
        if verbose:
            print(f' Saved {len(sketch_clauses)} test clauses into {out_test_fname}.')
        #
        # Pick randomly same amount of clauses from train & dev
        # (sets give constant-time membership tests below)
        #
        train_pure = whole_data_map['train'][1]
        dev_full = whole_data_map['dev'][1]
        train_sample = set(rnd.sample(range(len(train_pure)), removed_from_train))
        dev_sample = set(rnd.sample(range(len(dev_full)), removed_from_dev))
        #
        # Remove randomly picked clause instances
        #
        ablation_train = [conllu for i, conllu in enumerate(train_pure) if i not in train_sample]
        ablation_dev = [conllu for i, conllu in enumerate(dev_full) if i not in dev_sample]
        #
        # Save results
        #
        out_dev_fname = f'dev_{sketch_id}.conllu'
        write_conllu_file( os.path.join(output_dir, out_dev_fname), ablation_dev )
        if verbose:
            print(f' Removed {removed_from_dev} clauses randomly and saved remaining {len(ablation_dev)} dev clauses into {out_dev_fname}.')
        out_train_fname = f'train_{sketch_id}.conllu'
        write_conllu_file( os.path.join(output_dir, out_train_fname), ablation_train )
        if verbose:
            print(f' Removed {removed_from_train} clauses randomly and saved remaining {len(ablation_train)} train clauses into {out_train_fname}.')
# =============================================
#   Script entry point
# =============================================

if __name__ == '__main__':
    # At least one configuration file must be given on the command line
    if not sys.argv[1:]:
        raise Exception('(!) Missing input argument: name of the configuration INI file.')
    # Every command line argument is treated as a configuration file
    for conf_file in sys.argv[1:]:
        prepare_sketches_main( conf_file )
def _get_required_option(config, conf_file, section, option):
    '''Returns the value of `option` in `section`; raises ValueError if it is missing.'''
    if not config.has_option(section, option):
        raise ValueError(f'Error in {conf_file}: section {section!r} is missing "{option}" parameter.')
    return config[section][option]


def _get_input_dir(config, conf_file, section):
    '''Returns the "input_dir" of `section`; raises FileNotFoundError if it is not a directory.'''
    input_dir = _get_required_option(config, conf_file, section, 'input_dir')
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "input_dir" value {input_dir!r} in {section!r}.')
    return input_dir


def _collect_concatenate_files(config, conf_file, section, input_dir, skip_fname):
    '''
    Finds the two conllu files of `input_dir` named by the "concatenate" option
    of `section` (default: 'train, dev'). File `skip_fname` is never picked.
    Raises ValueError if the option is malformed or 2 files cannot be found.
    '''
    concatenate = config[section].get('concatenate', 'train, dev')
    target_subsets = concatenate.split(',')
    if len(target_subsets) != 2:
        raise ValueError(f'Error in {conf_file}: {section}.concatenate must have 2 values, '+\
                         f'not {len(target_subsets)}.' )
    concatenate_files = []
    for subset in target_subsets:
        subset = subset.strip()
        if subset not in ['train', 'dev', 'test']:
            raise ValueError(f'Error in {conf_file}: {section}.concatenate has invalid value {subset}.')
        # Find corresponding file from the input dir
        for fname in os.listdir(input_dir):
            if fname == skip_fname:
                continue
            if fname.endswith('.conllu') and subset in fname:
                concatenate_files.append(os.path.join(input_dir, fname))
    if len(concatenate_files) != 2:
        raise ValueError(f'Unable to get 2 concatenateable conllu files from dir {input_dir!r}. '+
                          'Is the directory correct?')
    return concatenate_files


def _run_join_section(config, conf_file, section):
    '''Executes one 'join_' section: concatenates two files (typically train and dev) into one.'''
    input_dir = _get_input_dir(config, conf_file, section)
    train_full_name = config[section].get('train_full', 'train_full.conllu')
    concatenate_files = _collect_concatenate_files(config, conf_file, section,
                                                   input_dir, train_full_name)
    output_dir = _get_required_option(config, conf_file, section, 'output_dir')
    os.makedirs(output_dir, exist_ok=True)
    train_full_file = os.path.join(output_dir, train_full_name)
    join_train_dev( train_path=concatenate_files[0], dev_path=concatenate_files[1],
                    output_path=train_full_file )


def _run_single_file_split_section(config, conf_file, section):
    '''Executes one 'split_' section with split_type 'single_file'.'''
    input_file = _get_required_option(config, conf_file, section, 'input_file')
    split_ratio = config[section].get('split_ratio', '80, 10, 10')
    split_subset_ratios = []
    for subset in split_ratio.split(','):
        subset = subset.strip()
        if not subset.isnumeric():
            raise ValueError(f'Error in {conf_file}: {section}.split_ratio has invalid value {subset}.')
        split_subset_ratios.append( int(subset) )
    if len(split_subset_ratios) != 3:
        # Report the whole option value (not just its last item)
        raise ValueError(f'Error in {conf_file}: {section}.split_ratio has invalid value {split_ratio!r}: must have 3 ratios.')
    seed = config[section].getint('seed', 9)
    shuffle = config[section].getboolean('shuffle', False)
    subset_size = config[section].getint('subset_size', None)
    output_dir = _get_required_option(config, conf_file, section, 'output_dir')
    create_single_file_split(input_file, output_dir,
                             train=split_subset_ratios[0],
                             dev=split_subset_ratios[1],
                             test=split_subset_ratios[2],
                             subset_size=subset_size,
                             shuffle=shuffle,
                             seed=seed)


def _run_data_split_section(config, conf_file, section, split_type):
    '''Executes one 'split_' section of type 'crossvalidation', 'smaller_data' or 'half_data'.'''
    input_dir = _get_input_dir(config, conf_file, section)
    concatenate_files = _collect_concatenate_files(config, conf_file, section,
                                                   input_dir, 'train_full.conllu')
    split_file = config[section].get('split_file', 'splits.csv')
    block_count = config[section].getint('block_count', 195)
    split_count = config[section].getint('split_count', 10)
    seed = config[section].getint('seed', 9)
    first_output_dir = _get_required_option(config, conf_file, section, 'first_output_dir')
    final_output_dir = _get_required_option(config, conf_file, section, 'final_output_dir')
    if split_type == 'crossvalidation':
        create_crossvalidation_splits(concatenate_files[0], concatenate_files[1], first_output_dir,
                                      final_output_dir, split_csv_file=split_file, block_count=block_count,
                                      split_count=split_count, seed=seed)
    elif split_type == 'smaller_data':
        create_smaller_data_splits(concatenate_files[0], concatenate_files[1], first_output_dir,
                                   final_output_dir, split_csv_file=split_file, block_count=block_count,
                                   split_count=split_count, seed=seed)
    elif split_type == 'half_data':
        # Warn about parameters that cannot be changed (not implemented)
        for param in ['block_count', 'split_count']:
            if config.has_option(section, param):
                msg = f'In {conf_file}, section {section!r}: parameter {param!r} not changable in half_data experiments.'
                warnings.warn( msg )
        create_half_data_splits(concatenate_files[0], concatenate_files[1], first_output_dir, final_output_dir,
                                split_csv_file=split_file, seed=seed)


def create_train_splits_joins_main( conf_file ):
    '''
    Creates data splits (or joins) based on the configuration.
    Settings/parameters of the conversion will be read from the given
    `conf_file`.
    Executes sections in the configuration starting with prefixes
    'split_' and 'join_'.

    For details about the conversion and possible parameters,
    see the functions:
    * `create_crossvalidation_splits(...)`
    * `create_smaller_data_splits(...)`
    * `create_half_data_splits(...)`
    * `create_single_file_split(...)`

    Raises FileNotFoundError if `conf_file` or a configured "input_dir" does
    not exist, and ValueError if the file cannot be read as INI or a section
    has missing or invalid parameters.
    '''
    # Parse configuration file
    config = configparser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise FileNotFoundError("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))
    section_found = False
    for section in config.sections():
        if section.startswith('join_'):
            section_found = True
            print(f'Performing {section} ...')
            _run_join_section(config, conf_file, section)
        elif section.startswith('split_'):
            section_found = True
            print(f'Performing {section} ...')
            split_type = config[section].get('split_type', 'crossvalidation')
            split_type_clean = (split_type.strip()).lower()
            if split_type_clean not in ['crossvalidation', 'smaller_data', 'half_data', 'single_file']:
                raise ValueError('(!) Unexpected split type value: {!r}'.format(split_type))
            if split_type_clean == 'single_file':
                _run_single_file_split_section(config, conf_file, section)
            else:
                _run_data_split_section(config, conf_file, section, split_type_clean)
    if not section_found:
        print(f'No section starting with "split_" or "join_" in {conf_file}.')
def create_crossvalidation_splits(train_file, dev_file, first_splits_path, crossval_split_path,
                                  split_csv_file='splits.csv', block_count=195, split_count=10, seed=9):
    '''
    Creates crossvalidation splits based on given input CONLLU files (`train_file` and `dev_file`).
    First, concatenates train_file and dev_file into one file and saves under first_splits_path.
    Then splits the concatenated conllu file randomly into `split_count` sub sets.

    The splitting process: 1) the input is first split at the word-level into roughly
    equal-sized blocks (`block_count` blocks), 2) the order of blocks is shuffled, and
    3) the shuffling result is split into `split_count` sub sets.

    Saves sub sets into folder `first_splits_path`.

    Finally, rotates over subsets in a way that each sub set is used once as dev and
    once as test, and the remaining subsets belong to train. Saves results
    (crossvalidation subsets) into folder `crossval_split_path`.

    Sub set files are processed in sorted file name order, because os.listdir()
    returns names in arbitrary order and the rotation should be reproducible.

    Raises FileNotFoundError if `train_file` or `dev_file` is not an existing file.
    '''
    # Validate inputs
    for input_path in (train_file, dev_file):
        if not os.path.isfile(input_path):
            raise FileNotFoundError(f'(!) Missing or invalid input file {input_path}')
    os.makedirs(first_splits_path, exist_ok=True)

    # Concatenate train and dev into one file
    train_full_file = os.path.join(first_splits_path, 'train_full.conllu')
    join_train_dev(train_path=train_file, dev_path=dev_file, output_path=train_full_file)
    # Split train_full_file randomly into `split_count` sub sets (roughly equal size blocks)
    splitting(input_file=train_full_file, output_dir=first_splits_path, split_file=split_csv_file,
              block_count=block_count, split_count=split_count, seed=seed)

    # Divide files to dev - test - train, save into crossvalidation folder
    os.makedirs(crossval_split_path, exist_ok=True)
    split_conllu_files = sorted(fname for fname in os.listdir(first_splits_path)
                                if fname.endswith('.conllu') and fname != 'train_full.conllu')
    n_splits = len(split_conllu_files)

    def write_subset(split_key, subset_name, fnames):
        # Writes concatenation of the given sub set files into '{split_key}_{subset_name}.conllu'
        out_path = os.path.join(crossval_split_path, '{}_{}.conllu'.format(split_key, subset_name))
        with open(out_path, 'w', encoding='utf-8') as fout:
            fout.write(join_file_contents(first_splits_path, fnames))

    # CSV file with logging/debugging info (closed even if writing a split fails)
    with open(os.path.join(crossval_split_path, split_csv_file), 'w', newline='', encoding='utf-8') as splits_csv:
        csv_writer = csv.writer(splits_csv)
        csv_writer.writerow(['split', 'dev', 'test', 'train'])

        # Rotating splits: sub set i is dev, the next one (cyclically) is test,
        # and the remaining ones (in cyclic order) form train
        for i in range(n_splits):
            split_key = '{:02d}'.format(i+1)
            print(split_key)

            dev = split_conllu_files[i]
            test = split_conllu_files[(i + 1) % n_splits]
            train = [split_conllu_files[(i + 2 + k) % n_splits] for k in range(n_splits - 2)]

            csv_writer.writerow([i+1, dev, test, train])

            # Create new splits into crossvalidation data folder
            write_subset(split_key, 'dev', [dev])
            write_subset(split_key, 'test', [test])
            write_subset(split_key, 'train', train)
            write_subset(split_key, 'train_all', train + [dev])
def _split_train_dev(training_whole, development_percentage):
    '''
    Splits list `training_whole` into (train, dev), where dev is made of the
    last `development_percentage` percent of items. The multiplication is done
    before the integer division, so that the split point is also accurate
    for lists that have fewer than 100 items.
    '''
    divider = len(training_whole) * (100 - development_percentage) // 100  # point between train and dev
    return training_whole[:divider], training_whole[divider:]


def create_smaller_data_splits(train_file, dev_file, first_splits_path, smaller_data_split_path,
                               split_csv_file='splits.csv', block_count=195, split_count=10, seed=9):
    '''
    Creates dataset size increasing crossvalidation splits based on given input CONLLU files
    (`train_file` and `dev_file`).
    First, concatenates train_file and dev_file into one file and saves under first_splits_path.
    Then splits the concatenated conllu file randomly into `split_count` sub sets.

    The splitting process: 1) the input is first split at the word-level into roughly
    equal-sized blocks (`block_count` blocks), 2) the order of blocks is shuffled, and
    3) the shuffling result is split into `split_count` sub sets.

    Saves initial sub sets into folder `first_splits_path`.

    Creates `split_count` training sets with gradually increasing size: first set contains
    only 1 sub set from `first_splits_path`, second set contains 2 sub sets from `first_splits_path`,
    and so on.
    In the process, these training sets ("train_all" sets) will be further split into train and
    dev subsets, with increasing dev set sizes / proportions: 10% if training consist of 1-3 sub sets,
    15% if training consists of 4-6 sub sets, and 20% if training is larger than 6 sub sets.
    Finally, saves results (sub sets with increasing sizes) into folder `smaller_data_split_path`.

    Note: `seed` is only used for making the initial sub sets; the order in which
    sub sets are dropped is determined by a fixed internal seed (4). Sub set files
    are initially taken in sorted file name order (os.listdir() order is arbitrary).

    Raises FileNotFoundError if `train_file` or `dev_file` is not an existing file.
    '''
    # Validate inputs
    for input_path in (train_file, dev_file):
        if not os.path.isfile(input_path):
            raise FileNotFoundError(f'(!) Missing or invalid input file {input_path}')
    os.makedirs(first_splits_path, exist_ok=True)

    # Concatenate train and dev into one file
    train_full_file = os.path.join(first_splits_path, 'train_full.conllu')
    join_train_dev(train_path=train_file, dev_path=dev_file, output_path=train_full_file)
    # Split train_full_file randomly into `split_count` sub sets (roughly equal size blocks)
    splitting(input_file=train_full_file, output_dir=first_splits_path, split_file=split_csv_file,
              block_count=block_count, split_count=split_count, seed=seed)

    os.makedirs(smaller_data_split_path, exist_ok=True)

    # Read names of all sub set files into a list
    undivided_splits = sorted(fname for fname in os.listdir(first_splits_path)
                              if fname.endswith('.conllu') and fname != 'train_full.conllu')

    def write_sentences(split_key, subset_name, sentences):
        # Serializes sentences into '{split_key}_{subset_name}.conllu'
        out_path = os.path.join(smaller_data_split_path, f'{split_key}_{subset_name}.conllu')
        with open(out_path, 'w', encoding='utf-8') as fout:
            fout.write(create_conllu(sentences))

    # CSV file with logging/debugging info (closed even if writing a split fails)
    with open(os.path.join(smaller_data_split_path, split_csv_file), 'w', newline='', encoding='utf-8') as splits_csv:
        csv_writer = csv.writer(splits_csv)
        csv_writer.writerow(['split', 'train'])

        for j in range(len(undivided_splits), 0, -1):
            split_key = '{:03d}'.format(j*10)
            print(split_key)

            # Save names of the sub set files used in this split
            csv_writer.writerow([split_key, undivided_splits])

            # % of sentences to be divided for development
            if j <= 3:
                dev_percentage = 10
            elif j <= 6:
                dev_percentage = 15
            else:
                dev_percentage = 20

            # Read remaining sub sets and parse them into conllu TokenLists
            contents = []
            for filename in undivided_splits:
                with open(os.path.join(first_splits_path, filename), "r", encoding="utf-8") as split_in:
                    contents.append(split_in.read())
            training_whole = conllu.parse(''.join(contents))

            training_train, training_dev = _split_train_dev(training_whole, dev_percentage)
            assert len(training_dev) + len(training_train) == len(training_whole)

            write_sentences(split_key, 'train_all', training_whole)
            write_sentences(split_key, 'train', training_train)
            write_sentences(split_key, 'dev', training_dev)

            # Drop one sub set and reorder the rest for the next (smaller) round
            undivided_splits.pop( random.Random(4).randrange(len(undivided_splits)) )
            random.Random(4).shuffle( undivided_splits )
def create_half_data_splits(train_file, dev_file, first_splits_path, half_data_split_path,
                            split_csv_file='splits.csv', seed=9):
    '''
    Creates half training set size crossvalidation splits based on given input CONLLU files
    (`train_file` and `dev_file`).
    First, concatenates `train_file` and `dev_file` into one file and saves under
    `first_splits_path`.

    The splitting process: 1) the concatenated conllu file is first split into 194 blocks of
    sentences, each block roughly same size in words.
    2) blocks are shuffled and two different train-dev-test splits are formed, with sizes:
    78 blocks for train, 19 for dev and 97 test (one selects first half for train-dev, other
    selects the second half for train-dev).
    3) step 2 gets repeated 5 times, resulting in 10 splits at total.

    Finally, saves results (sub sets with halved train sizes) into folder `half_data_split_path`
    (the folder is created if it does not exist yet).

    Raises FileNotFoundError if `train_file` or `dev_file` is not an existing file,
    and ValueError if the concatenated input contains no sentences.
    '''
    # Validate inputs
    for input_path in (train_file, dev_file):
        if not os.path.isfile(input_path):
            raise FileNotFoundError(f'(!) Missing or invalid input file {input_path}')
    os.makedirs(first_splits_path, exist_ok=True)

    # Concatenate train and dev into one file
    train_full_file = os.path.join(first_splits_path, 'train_full.conllu')
    join_train_dev(train_path=train_file, dev_path=dev_file, output_path=train_full_file)

    # Extract all tokens and label them with sentence id-s
    with open(train_full_file, 'r', encoding='utf-8') as in_f:
        all_sentences = conllu.parse( in_f.read() )
    if len(all_sentences) == 0:
        raise ValueError(f'(!) No sentences found in {train_full_file!r}')
    full_tokenlist = []
    for sent_id, sentence in enumerate(all_sentences):
        full_tokenlist.extend([sent_id] * len(sentence))

    # Splitting given number of sentence blocks, organizing by sentence id-s
    # (block_count=194 results in splits of about ~~2000 words)
    sentence_blocks = correct_splits(split(full_tokenlist, 194))

    def extract_sentences(blocks):
        # Maps blocks of sentence id-s to a flat list of sentences
        return [all_sentences[sent_id] for block in blocks for sent_id in block]

    def save_split_files(train, dev, test, split_no):
        # Writes train, dev, test and train_all (= train + dev) files of one split
        for subset_name, sentences in (('dev', dev), ('test', test), ('train', train)):
            out_path = os.path.join(half_data_split_path, '{}_{}.conllu'.format(split_no, subset_name))
            with open(out_path, 'w', encoding='utf-8') as fout:
                fout.write(create_conllu(sentences))
        join_train_dev(train_path=os.path.join(half_data_split_path, '{}_train.conllu'.format(split_no)),
                       dev_path=os.path.join(half_data_split_path, '{}_dev.conllu'.format(split_no)),
                       output_path=os.path.join(half_data_split_path, '{}_train_all.conllu'.format(split_no)))

    # The output folder must exist before the CSV log file can be opened in it
    os.makedirs(half_data_split_path, exist_ok=True)

    # CSV file with logging/debugging info (closed even if writing a split fails)
    with open(os.path.join(half_data_split_path, split_csv_file), 'w', newline='', encoding='utf-8') as splits_csv:
        csv_writer = csv.writer(splits_csv)
        csv_writer.writerow(['split', 'dev', 'test', 'train'])

        rnd = random.Random(seed)
        split_counter = 0
        for _ in range(5):
            rnd.shuffle(sentence_blocks)
            # Two complementary splits: train+dev is taken from the first half
            # of blocks in one split and from the second half in the other
            # (78 blocks for train, 19 blocks for dev, 97 blocks for test)
            half_splits = [
                (sentence_blocks[:78], sentence_blocks[78:97], sentence_blocks[97:]),
                (sentence_blocks[97:175], sentence_blocks[175:], sentence_blocks[:97]),
            ]
            for train_blocks, dev_blocks, test_blocks in half_splits:
                split_counter += 1
                split_key = '{:03d}'.format(split_counter)
                # Note: the reported numbers are counts of sentence blocks
                print(f'{split_key} -- train: #{len(train_blocks)} sents, dev: #{len(dev_blocks)} sents, test: #{len(test_blocks)} sents')
                csv_writer.writerow(['split_{}'.format(split_key), dev_blocks, test_blocks, train_blocks])
                save_split_files(extract_sentences(train_blocks),
                                 extract_sentences(dev_blocks),
                                 extract_sentences(test_blocks),
                                 split_key)
def create_single_file_split(input_file, output_dir, train=80, dev=10, test=10, subset_size=None, shuffle=False, seed=9):
    '''
    Divides sentences of a single conllu file into train, dev and test
    sub sets and writes each sub set into a separate file in `output_dir`
    ('train.conllu', 'dev.conllu' and 'test.conllu').

    Parameters train, dev and test are relative sizes (percentages) of
    the sub sets; they must add up to 100, otherwise ValueError is raised.

    If shuffle=True, sentences are shuffled (using `seed`) before splitting.

    If subset_size is given, only that many first sentences are used
    (taken after shuffling, if shuffling is enabled). ValueError is raised
    if subset_size exceeds the number of sentences in the file.

    This function is used for testing and debugging model training.
    '''
    if train+dev+test != 100:
        raise ValueError( f'(!) Parameter values train + dev + test ({train} + {dev} + {test}) '+\
                          f'do not add up to 100.' )
    with open(input_file, 'r', encoding='utf-8') as in_f:
        all_sentences = conllu.parse( in_f.read() )
    if shuffle:
        random.Random(seed).shuffle(all_sentences)
    if subset_size is not None:
        assert isinstance(subset_size, int)
        if subset_size > len(all_sentences):
            raise ValueError( f'(!) subset_size={subset_size} exceeds the number '+\
                              f'of sentences in {input_file!r} ({len(all_sentences)}).' )
        all_sentences = all_sentences[:subset_size]
    # Assign every sentence to a sub set by its relative position in the file
    total = len(all_sentences)
    collected = {'train': [], 'dev': [], 'test': []}
    for position, sentence in enumerate(all_sentences):
        percentage = int(position*100.0/total)
        if percentage < train:
            target = 'train'
        elif percentage < train+dev:
            target = 'dev'
        else:
            target = 'test'
        collected[target].append( sentence )
    # Sanity check
    assert sum(len(sentences) for sentences in collected.values()) == total
    # Write outputs
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    for subset_name, purpose in (('train', 'training'), ('dev', 'development'), ('test', 'test')):
        sentences = collected[subset_name]
        with open(os.path.join(output_dir, f'{subset_name}.conllu'), 'w', encoding='utf-8') as fout:
            fout.write(create_conllu(sentences))
        print( f'{len(sentences)} sentences for {purpose}.')
# ===============================================================
#   Utilities required by splitting functions
# ===============================================================

def join_train_dev(train_path, dev_path, output_path):
    """
    Concatenates two conllu files into one file.
    It is used to concatenate train and dev files.

    Both inputs are read completely before the output file is opened, so
    a missing input does not leave an empty output file behind, and all
    files get closed even if reading or writing fails.
    """
    with open(train_path, 'r', encoding='utf-8') as train_file:
        train_data = train_file.read()
    with open(dev_path, 'r', encoding='utf-8') as dev_file:
        dev_data = dev_file.read()
    with open(output_path, 'w', encoding='utf-8') as fout:
        fout.write(train_data)
        fout.write(dev_data)


def join_file_contents(input_path: str, filenames: list):
    '''
    Reads contents of given (conllu) files from input_path and returns their concatenation
    (an empty string, if `filenames` is empty).
    TODO: merge `join_train_dev` and `join_file_contents` into one function
    '''
    contents = []
    for filename in filenames:
        with open(os.path.join(input_path, filename), 'r', encoding='utf-8') as fin:
            contents.append(fin.read())
    return ''.join(contents)
def split(a, n):
    """
    Splits list `a` into `n` roughly equal-sized subsets.
    If `a` is not exactly divisible by `n`, then finds the
    reminder `r` of the division and enlarges sizes of first
    `r` subsets by 1. If `n` exceeds the length of `a`, the
    last subsets will be empty.
    Returns a generator of the split.

    Examples:

    >>> sp1 = split([1,1,2,2,3,3], 3)
    >>> list(sp1)
    [[1, 1], [2, 2], [3, 3]]
    >>> sp2 = split([1,2,2,3,3,3,4,4,4,4,5,5,5,5,5], 6)
    >>> list(sp2)
    [[1, 2, 2], [3, 3, 3], [4, 4, 4], [4, 5], [5, 5], [5, 5]]
    >>> sp3 = split([[1], [2,2], [3,3,3], [4,4,4,4]], 3)
    >>> list(sp3)
    [[[1], [2, 2]], [[3, 3, 3]], [[4, 4, 4, 4]]]
    """
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


def correct_splits(splits):
    """
    Corrects given splits (lists of lists of sentence id-s) in a way that
    each sentence id can only be in one sub list. Motivation: each sentence
    should belong to exactly one cross-validation subset.

    The input is a list of lists, where each sub list element is the number
    of sentence the word belongs to, e.g [1, 1, 1, 2, 2, 3, 3, 3].
    Assumingly, the division of corpus words into sub lists has been done with
    function `split(a, n)`.
    Returns list of lists, where each sublist contains unique id-s of sentences
    (in sorted order) and one sentence id can be in one list only. A sentence
    that crosses a sub list boundary is kept in the sub list where it starts.
    Empty input sub lists result in empty output sub lists.

    Example:

    >>> correct_splits([[0, 0, 0], [1, 1, 2], [2, 2]])
    [[0], [1, 2], []]
    """
    splits = list(splits)  # convert generator to list
    last_sentences_set = set()  # for sanity checking
    sentence_splits = []

    for i, subsplit in enumerate(splits):
        current_sentences_set = set(subsplit)

        if i > 0:
            # If the first sentence is not complete (it began in the previous
            # sub list), remove it from the current sub list.
            # An empty previous sub list has no sentence to carry over.
            previous = splits[i - 1]
            if previous and previous[-1] in current_sentences_set:
                current_sentences_set.remove(previous[-1])
            assert len(current_sentences_set.intersection(last_sentences_set)) == 0

        sentence_splits.append(sorted(current_sentences_set))  # sort to keep correct order
        last_sentences_set = current_sentences_set

    return sentence_splits
def create_conllu(tokenlists):
    """Serializes given TokenLists into conllu string that can be written to file."""
    return ''.join(sentence.serialize() for sentence in tokenlists)


def splitting(input_file, output_dir, split_file, block_count=195, split_count=10, seed=0):
    """
    Splits `input_file` (a large conllu file) randomly into `split_count` sub sets.
    The splitting process: 1) the input is first split at the word-level into roughly
    equal-sized blocks (`block_count` blocks), 2) the order of blocks is shuffled, and
    3) the shuffling result is split into `split_count` sub sets.

    The split data will be written into `output_dir`, and each sub set of the split will
    be written into separate file named 'split_{split_nr}.conllu'.
    Ordered blocks of sentence id-s of each split will be written into `split_file`.

    When run on Estonian UD TreeBank train+dev with block_count=195 and split_count=10,
    then each block will have a size of approx 2000 words, and each final split
    contains roughly 2500-3000 sentences.
    Note that the default `seed` of this function is 0; the splitting functions
    in this file that call it pass their own seed (their default is 9).

    :param input_file: input conllu file
    :param output_dir: directory for generated splits
    :param split_file: name for csv where to save data about splits (ordered sentence IDs)
    :param block_count: number of blocks of sentences to split data into
    :param split_count: number of final splits
    :param seed: seed value to be used for reproducibility
    :return: none
    """
    with open(input_file, 'r', encoding='utf-8') as conllu_in:
        all_sentences = conllu.parse(conllu_in.read())
    # Label every token with the id of its sentence
    full_tokenlist = []
    for sent_id, sentence in enumerate(all_sentences):
        full_tokenlist.extend([sent_id] * len(sentence))

    # Splitting given number of sentence blocks, organizing by sentence id-s
    # (block_count=195 results in splits of about 2000 words)
    sentence_blocks = correct_splits(split(full_tokenlist, block_count))
    random.Random(seed).shuffle(sentence_blocks)

    training_splits = list(split(sentence_blocks, split_count))  # List of sentence blocks divided to `split_count` sub sets

    # The csv file gets closed even if writing of a sub set fails
    with open(os.path.join(output_dir, split_file), 'w', newline='', encoding='utf-8') as csv_out:
        writer = csv.writer(csv_out)
        for j, training_split in enumerate(training_splits, 1):
            # Get conllu TokenLists corresponding to sentence IDs
            training_whole = [all_sentences[sent_id] for block in training_split for sent_id in block]
            with open(os.path.join(output_dir, 'split_{}.conllu'.format(str(j))), 'w', encoding='utf-8') as tr:
                tr.write(create_conllu(training_whole))
            writer.writerow([str(j), training_split])
+ + :param input_file: input conllu file + :param output_dir: directory for generated splits + :param split_file: name for csv where to save data about splits (ordered sentence IDs) + :param block_count: number of blocks of sentences to split data into + :param split_count: number of final splits + :param seed: seed value to be used for reproducibility + :return: none + """ + + split_file = open(os.path.join(output_dir, split_file), 'w', newline='', encoding='utf-8') + input_file = open(input_file, 'r', encoding='utf-8') + writer = csv.writer(split_file) + + all_sentences = conllu.parse(input_file.read()) + full_tokenlist = list() + for i, sentence in enumerate(all_sentences): + full_tokenlist.extend([i] * len(sentence)) + input_file.close() + + # Splitting given number of sentence blocks, organizing by sentence id-s + # (block_count=195 results in splits of about 2000 words) + sentence_blocks = correct_splits(split(full_tokenlist, block_count)) + random.Random(seed).shuffle(sentence_blocks) + + training_splits = list(split(sentence_blocks, split_count)) # List of sentence blocks divided to 10 + + #training_split_map = dict() + for j, training_split in enumerate(training_splits, 1): + # Get conllu TokenLists corresponding to sentence IDs + training_whole = [] + for block in training_split: + training_whole.extend([all_sentences[id] for id in block]) + with open(os.path.join(output_dir, 'split_{}.conllu'.format(str(j))), 'w', encoding='utf-8') as tr: + tr.write(create_conllu(training_whole)) + writer.writerow([str(j), training_split]) + #training_split_map[j] = training_split + + split_file.close() + #return training_split_map + + +if __name__ == '__main__': + if len(sys.argv) < 2: + raise Exception('(!) 
# Names of all implemented gap experiments (conllu file modifications).
# Configuration files refer to the experiments by these names.
gap_experiment_names = [
    '01_no_wordforms',                     # remove 'form' of every word
    '02_no_lemmas',                        # remove 'lemma' of every word
    '02_no_pos',                           # remove 'upos' and 'xpos' of every word
    '03_no_adj_noun_lemmas',               # remove 'lemma' of nouns and adjectives
    '03_no_wordforms_adj_noun_lemmas',     # remove all forms + lemmas of nouns and adjectives
    '04_no_verb_adpos_lemmas',             # remove 'lemma' of verbs and adpositions
    '04_no_wordforms_verb_adpos_lemmas',   # remove all forms + lemmas of verbs and adpositions
    '05_only_cg_list_wordforms_lemmas',    # remove all forms + lemmas not in the CG lemmas list
    '06_no_wordform_lemma_pos_keep_conj',  # remove form, lemma, upos/xpos of non-conjunctions
    '07_no_wordform_lemma_pos',            # remove form, lemma, upos/xpos of every word
    '08_only_wordforms',                   # keep only 'form'
    '09_only_pos_feats',                   # keep only 'upos', 'xpos' and 'feats'
]
def _gap_experiment_passes( exp_name, cg_lemmas_set=None ):
    '''
    Returns modification passes of the gap experiment exp_name as a list of
    (remove_fields, token_picker) pairs, or None if exp_name is unknown.
    '''
    is_noun_or_adj   = lambda token: token['xpos'] in ['S', 'A']
    is_verb_or_adpos = lambda token: token['xpos'] in ['V', 'K']
    is_not_conj      = lambda token: token['xpos'] not in ['J']
    not_in_cg_list   = lambda token: not cg_lemma_match(token, cg_lemmas_set)
    passes_by_experiment = {
        # remove all word forms
        '01_no_wordforms': [ (['form'], None) ],
        # remove all lemmas
        '02_no_lemmas': [ (['lemma'], None) ],
        # remove all part-of-speech tags
        '02_no_pos': [ (['upos', 'xpos'], None) ],
        # remove lemmas of nouns and adjectives
        '03_no_adj_noun_lemmas': [ (['lemma'], is_noun_or_adj) ],
        # remove all word forms, and lemmas of nouns and adjectives
        '03_no_wordforms_adj_noun_lemmas': [ (['form'], None), (['lemma'], is_noun_or_adj) ],
        # remove lemmas of verbs and adpositions
        '04_no_verb_adpos_lemmas': [ (['lemma'], is_verb_or_adpos) ],
        # remove all word forms, and lemmas of verbs and adpositions
        '04_no_wordforms_verb_adpos_lemmas': [ (['form'], None), (['lemma'], is_verb_or_adpos) ],
        # remove all word forms, and lemmas that are not in the CG list
        '05_only_cg_list_wordforms_lemmas': [ (['form'], None), (['lemma'], not_in_cg_list) ],
        # remove form, lemma and pos, unless the word is a conjunction
        '06_no_wordform_lemma_pos_keep_conj': [ (['form', 'lemma', 'upos', 'xpos'], is_not_conj) ],
        # remove form, lemma and pos everywhere (maximal deletion)
        '07_no_wordform_lemma_pos': [ (['form', 'lemma', 'upos', 'xpos'], None) ],
        # keep only 'form' and remove 'lemma', 'upos'/'xpos', 'feats'
        '08_only_wordforms': [ (['lemma', 'upos', 'xpos', 'feats'], None) ],
        # keep only 'upos', 'xpos', and 'feats' and remove 'form' and 'lemma'
        '09_only_pos_feats': [ (['form', 'lemma'], None) ],
    }
    return passes_by_experiment.get(exp_name)


def modify_directory( in_dir, out_dir, gap_experiments, conll_file_pat=None, skip_files=('train_full.conllu',), suppress_checks=False ):
    '''
    Iteratively processes all train/test/dev conllu files from in_dir, performing all
    modifications listed in gap_experiments. Saves modified files into out_dir
    (the directory is created if it does not exist). Output files are named
    '{experiment}_{sub experiment}_{train|dev|test|train_all}.conllu'.

    If in_dir contains files of multiple sub experiments, conll_file_pat should be a
    regular expression that generalizes file name over all sub experiments, and
    allows to extract sub experiment name (the regex must have a named group 'exp',
    capturing the name of the sub experiment). Files not matching the pattern are
    skipped.

    Unless suppress_checks==True, the number of collected train files must be equal
    to the number of dev files and to the number of test files; otherwise, an
    exception will be raised. Use suppress_checks=True to disable that validation,
    or alternatively, provide skip_files with the names of files to be left out
    of modifications (names are compared case-insensitively).

    Raises TypeError / ValueError on an invalid conll_file_pat, FileNotFoundError
    if no suitable conllu files are found, and Exception if the train/dev/test
    counts validation fails.
    '''
    start_time = datetime.now()
    # If conll_file_pat is given, try to convert it to regular expression
    conll_file_regexp = None
    if conll_file_pat is not None:
        if not isinstance(conll_file_pat, str):
            raise TypeError(f'conll_file_pat must be a string')
        try:
            conll_file_regexp = re.compile(conll_file_pat)
        except re.error as err:
            raise ValueError(f'Unable to convert {conll_file_pat!r} to regexp') from err
        if 'exp' not in conll_file_regexp.groupindex:
            raise ValueError(f'Regexp {conll_file_pat!r} is missing named group "exp"')
    # File names are compared in lower case, so normalize skip list, too
    skipped_names = { name.lower() for name in (skip_files or ()) }
    # Collect conllu files from input dir
    input_files = []
    for fname in sorted(os.listdir( in_dir )):
        fname_lower = fname.lower()
        if fname_lower in skipped_names or not fname_lower.endswith('.conllu'):
            continue
        # Check conllu file pattern (if required)
        sub_exp = None
        if conll_file_regexp is not None:
            m = conll_file_regexp.match( fname )
            if not m:
                # Skip file if it does not match the pattern
                continue
            sub_exp = m.group('exp')
        # Determine file type: (train_all,) train, dev or test.
        # 'train_all' must be tested before its substring 'train'
        cur_ftype = next( (f_type for f_type in ('train_all', 'train', 'dev', 'test')
                                  if f_type in fname_lower), None )
        if cur_ftype is None:
            warnings.warn(f'(!) Could not determine if {fname!r} is train, dev or test file. '+
                           'Skipping file')
            continue
        input_files.append( (cur_ftype, os.path.join( in_dir, fname ), sub_exp) )
    if len(input_files) == 0:
        raise FileNotFoundError(f'(!) No suitable conllu files found from directory {in_dir!r}.')
    # Validate that there is an equal number of files in each sub set
    if not suppress_checks:
        dev_files   = [in_file for in_type, in_file, sub_exp in input_files if in_type == 'dev']
        test_files  = [in_file for in_type, in_file, sub_exp in input_files if in_type == 'test']
        train_files = [in_file for in_type, in_file, sub_exp in input_files if in_type == 'train']
        if len(dev_files) != len(train_files):
            raise Exception( f'(!) Number of collected train files does not match with the number of '+\
                             f'dev files. train_files: {train_files!r} vs dev_files: {dev_files!r}. '+\
                             'Please make sure there is equal number of train, dev and test files for '+\
                             'each experiment.' )
        if len(test_files) != len(train_files):
            raise Exception( f'(!) Number of collected train files does not match with the number of '+\
                             f'test files. train_files: {train_files!r} vs test_files: {test_files!r}. '+\
                             'Please make sure there is equal number of train, dev and test files for '+\
                             'each experiment.' )
    if not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)
    # Normalize experiment names once, so that loading of the CG list and
    # the dispatching below agree on the case of the names
    experiment_names = [name.lower() for name in gap_experiments]
    cg_lemmas_set = None
    if '05_only_cg_list_wordforms_lemmas' in experiment_names:
        cg_lemmas_set = load_cg_list()
        print(f' Loaded {len(cg_lemmas_set)} EstCG lemmas.')
    for exp_name in experiment_names:
        passes = _gap_experiment_passes( exp_name, cg_lemmas_set )
        if passes is None:
            warnings.warn(f'(!) Unknown gap experiment {exp_name!r}. Skipping that modification step.')
            continue
        for in_type, in_file, sub_exp in input_files:
            # Sub experiment name was already captured while collecting files
            subset_name = ''
            if sub_exp is not None:
                subset_name = sub_exp if sub_exp.endswith('_') else sub_exp + '_'
            out_file_temp  = os.path.join( out_dir, f'{exp_name}_{subset_name}{in_type}_temp.conllu' )
            out_file_final = os.path.join( out_dir, f'{exp_name}_{subset_name}{in_type}.conllu' )
            # Chain the passes: intermediate result goes to the temp file,
            # the last pass writes the final file
            source = in_file
            try:
                for pass_nr, (fields, picker) in enumerate(passes, 1):
                    target = out_file_final if pass_nr == len(passes) else out_file_temp
                    modify_file(source, target, remove_fields=fields, token_picker=picker)
                    source = target
            finally:
                # Do not leave temp files behind, even if a pass fails
                if len(passes) > 1 and os.path.exists(out_file_temp):
                    os.remove(out_file_temp)
    print(f'Total time elapsed: {datetime.now()-start_time}')
def modify_file( in_file, output_file, remove_fields=(), token_picker=None, remove_meta=False ):
    '''
    Modifies conllu in_file by removing (masking) specified fields. Optionally, if a
    token_picker (lambda / callable that should return a boolean) is defined, then
    removes fields only from tokens that satisfy the token_picker.
    Fields 'form', 'upos' and 'xpos' are masked with '---', all other fields with '_'.
    If remove_meta is set, then sentence metadata 'text', 'sent_id' and 'newdoc id'
    is also removed (where present).
    Saves modified file into output_file.

    Raises ValueError if token_picker is given, but is not callable.
    '''
    if token_picker is not None and not callable(token_picker):
        raise ValueError('(!) token_picker should be a function (callable) that '+\
                         'can be used for picking tokens for deletion.')
    if remove_fields is None:
        fields = []
    elif isinstance(remove_fields, str):
        # a single field name given as a plain string
        fields = [remove_fields]
    else:
        fields = list(remove_fields)
    with open(in_file, 'r', encoding='utf-8') as conllu_file, \
         open(output_file, 'w', encoding='utf-8') as fout:
        for sentence in parse_incr(conllu_file):
            if remove_meta:
                # Not every sentence carries all metadata fields,
                # so missing ones are silently skipped
                for meta_key in ('text', 'sent_id', 'newdoc id'):
                    sentence.metadata.pop(meta_key, None)
            for token in sentence:
                if token_picker is not None and not token_picker(token):
                    # Skip this token
                    continue
                for key in fields:
                    if key not in token:
                        continue
                    if key in ('form', 'upos', 'xpos'):
                        # TODO: next time, use a token that does
                        # not appear among punctuation tokens of
                        # the corpus
                        token[key] = '---'
                    else:
                        token[key] = '_'
            fout.write(sentence.serialize())
def predict_stanza_morph_main( conf_file, dry_run=False ):
    '''
    Executes stanza's morphological tagging models for predictions
    based on the given configuration.
    Settings/parameters of the prediction will be read from the
    given `conf_file`.
    Executes sections in the configuration starting with prefix
    'predict_morph_feats_stanza_'.

    Parameter `dry_run` gives the default for the section-level
    option 'dry_run': if it evaluates to True for a section, then
    the section is only validated and no predictions are made.

    Raises FileNotFoundError if the configuration file or a file
    referred to by the configuration is missing, and ValueError if
    the configuration is unreadable or lacks a mandatory parameter.
    '''
    # Parse configuration file
    config = configparser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise FileNotFoundError("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))

    def _required(section, option):
        # Fetches value of a mandatory option
        if not config.has_option(section, option):
            raise ValueError(f'Error in {conf_file}: section {section!r} is missing "{option}" parameter.')
        return config[section][option]

    def _required_file(section, option):
        # Fetches value of a mandatory option that must point to an existing file
        path = _required(section, option)
        if not os.path.isfile(path):
            raise FileNotFoundError(f'Error in {conf_file}: invalid "{option}" value {path!r} in {section!r}.')
        return path

    section_found = False
    for section in config.sections():
        # ------------------------------------------------------------
        #  predict using stanza lemmatizer & morphological tagger
        # ------------------------------------------------------------
        if not section.startswith('predict_morph_feats_stanza_'):
            continue
        section_found = True
        print(f'Running {section} ...')
        settings = config[section]
        # skip_train: do not predict on train files
        skip_train = settings.getboolean('skip_train', False)
        # train_file with path
        train_file = None
        if not skip_train:
            train_file = _required_file(section, 'train_file')
        # test_file with path
        test_file = _required_file(section, 'test_file')
        # output_dir
        output_dir = _required(section, 'output_dir')
        # Keep the section's setting in a local variable: overwriting the
        # function argument would leak the setting into following sections
        section_dry_run = settings.getboolean('dry_run', dry_run)
        use_gpu = settings.getboolean('use_gpu', False)
        output_prefix = settings.get('output_file_prefix', 'morph_predicted_')
        lang = settings.get('lang', 'et')
        # predict with a single model: get models file with paths
        lemmatizer_model_file = settings.get('lemmatizer_model_file', None)
        morph_tagger_model_file = settings.get('morph_tagger_model_file', None)
        # or, alternatively, download stanza's models
        download_models = settings.getboolean('download_models', False)
        if lemmatizer_model_file is None and morph_tagger_model_file is None and not download_models:
            raise ValueError(f'Error in {conf_file}: section {section!r} is missing parameters '+\
                              '"lemmatizer_model_file" and "morph_tagger_model_file". At least one '+\
                              'of these parameters must be defined.')
        if lemmatizer_model_file is not None and not os.path.isfile( lemmatizer_model_file ):
            raise FileNotFoundError(f'Error in {conf_file}: invalid "lemmatizer_model_file" value '+\
                                    f'{lemmatizer_model_file!r} in {section!r}. ')
        if morph_tagger_model_file is not None and not os.path.isfile( morph_tagger_model_file ):
            raise FileNotFoundError(f'Error in {conf_file}: invalid "morph_tagger_model_file" value '+\
                                    f'{morph_tagger_model_file!r} in {section!r}. ')
        if section_dry_run:
            continue
        start_time = datetime.now()
        # Predict on train (if required), then on test
        inputs = [test_file] if skip_train else [train_file, test_file]
        for input_file in inputs:
            output_file = os.path.join(output_dir, f'{output_prefix}{os.path.basename(input_file)}')
            predict_with_stanza_pipeline(input_file, lemmatizer_model_file, morph_tagger_model_file,
                                         output_file, download_models=download_models, lang=lang,
                                         use_gpu=use_gpu)
        print()
        print(f'Total time elapsed: {datetime.now()-start_time}')

    if not section_found:
        print(f'No section starting with "predict_morph_feats_stanza_" in {conf_file}.')
def predict_with_stanza_pipeline(input_path, lemmatizer_model_path, morph_tagger_model_path,
                                 output_path, download_models=False, lang='et', use_gpu=False):
    '''
    Applies stanza's lemmatizer_model/morph_tagger_model on given input CONLLU file to
    get lemma and/or morphological (pos & feats) predictions. Alternatively, uses
    lemmatizer/morph_tagger_model downloaded from stanza's resources for predictions.
    Saves predictions to output CONLLU file; the directory of the output file is
    created if it does not exist.

    :param input_path: path to conllu file to be annotated
    :param lemmatizer_model_path: path to lemmatizer model to be used for predictions
    :param morph_tagger_model_path: path to tagger model to be used for predictions
    :param output_path: path to output conllu file
    :param download_models: whether models should be downloaded instead of using existing ones
    :param lang: language code passed to stanza's Pipeline
    :param use_gpu: value of Pipeline's 'use_gpu' setting
    :raises ValueError: if neither a model path nor download_models is given, or if
        a model path is combined with download_models
    '''
    has_local_models = morph_tagger_model_path is not None or lemmatizer_model_path is not None
    if not has_local_models and not download_models:
        raise ValueError('(!) At least one of the model paths lemmatizer_model_path and '+\
                         'morph_tagger_model_path must be provided.')
    elif has_local_models and download_models:
        raise ValueError('(!) Conflicting parameters: cannot use morph_tagger_model_path or '+\
                         'lemmatizer_model_path if download_models is switched on. ')
    if download_models:
        processors = ['pos', 'lemma']
    else:
        processors = []
        if morph_tagger_model_path is not None:
            processors.append('pos')
        if lemmatizer_model_path is not None:
            processors.append('lemma')
    config = {
        'processors': 'tokenize,'+(','.join(processors)),  # Comma-separated list of processors to use
        'lang': lang,  # Language code for the language to build the Pipeline in
        'use_gpu': use_gpu
    }
    if not download_models:
        # Use existing models, do not download anything
        config['download_method'] = 0  # NONE won't download anything
        if morph_tagger_model_path is not None:
            config['pos_model_path'] = morph_tagger_model_path
        if lemmatizer_model_path is not None:
            config['lemma_model_path'] = lemmatizer_model_path
    # Note: "tokenize" is listed in 'processors' because it is
    # mandatory for lemma/pos. However, we don't want to use it.
    # Use pretokenized text as input and disable tokenization
    config['tokenize_pretokenized'] = True
    nlp = Pipeline(**config)
    doc = create_stanza_document(input_path, mask_morph_feats=True)
    nlp(doc)
    # output_path without a directory part refers to the current directory:
    # there is nothing to create then (and os.makedirs('') would fail)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    write_stanza_doc_to_conll(doc, output_path)
+# Supported models: +# * stanza syntax (depparse) +# Implemented settings: +# * full_data +# * multi_experiment (general) +# * crossvalidation +# * half_data +# * smaller_data +# +import os +import os.path +import sys +import re +import argparse +from datetime import datetime + +from stanza.models.parser import main as stanza_main + +from stanza.utils.conll18_ud_eval import load_conllu_file as stanza_load_conllu_file +from stanza.utils.conll18_ud_eval import evaluate +from stanza.utils.conll18_ud_eval import build_evaluation_table + +import configparser + +# =============================================================== +# Train Stanza for syntax (MAIN) +# =============================================================== + +def train_models_main( conf_file, subexp=None, dry_run=False ): + ''' + Trains models based on the configuration. + Settings/parameters of the training will be read from the given + `conf_file`. + Executes sections in the configuration starting with prefix + 'train_stanza_'. + + Optinally, if `subexp` is defined, then trains and evaluates only + that sub-experiment and skips all other sub-experiments (in + crossvalidation, smaller_data and half_data experiments). 
+ ''' + # Parse configuration file + config = configparser.ConfigParser() + if conf_file is None or not os.path.exists(conf_file): + raise FileNotFoundError("Config file {} does not exist".format(conf_file)) + if len(config.read(conf_file)) != 1: + raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file)) + section_found = False + for section in config.sections(): + # ------------------------------------------ + # s t a n z a + # ------------------------------------------ + if section.startswith('train_stanza_'): + section_found = True + subexp_str = '' if subexp is None else f' ({subexp})' + print(f'Running {section}{subexp_str} ...') + experiment_type = config[section].get('experiment_type', 'full_data') + experiment_type_clean = (experiment_type.strip()).lower() + if experiment_type_clean not in ['full_data', 'crossvalidation', 'half_data', 'smaller_data', 'multi_experiment']: + raise ValueError('(!) Unexpected experiment_type value: {!r}'.format(experiment_type)) + if experiment_type_clean == 'full_data': + # ------------------------------------------ + # 'full_data' + # ------------------------------------------ + # train_file with path + if not config.has_option(section, 'train_file'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "train_file" parameter.') + train_file = config[section]['train_file'] + if not os.path.isfile(train_file): + raise FileNotFoundError(f'Error in {conf_file}: invalid "train_file" value {train_file!r} in {section!r}.') + # eval_file with path + if not config.has_option(section, 'eval_file'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "eval_file" parameter.') + eval_file = config[section]['eval_file'] + if not os.path.isfile(eval_file): + raise FileNotFoundError(f'Error in {conf_file}: invalid "eval_file" value {eval_file!r} in {section!r}.') + # output_dir + if not config.has_option(section, 'output_dir'): + raise ValueError(f'Error in 
{conf_file}: section {section!r} is missing "output_dir" parameter.') + output_model_dir = config[section]['output_dir'] + # other parameters + output_model_file = config[section].get('model_file', ' model.pt') + extra_args = config[section].get('args', '') + dry_run = config[section].getboolean('dry_run', dry_run) + predict_after = config[section].getboolean('predict_after', False) + parser = 'stanza' + eval_path, eval_file_name = os.path.split(eval_file) + output_file = os.path.join( output_model_dir, 'train_output_'+eval_file_name ) + output_eval_score_file = 'eval_'+(eval_file_name.replace('.conllu', '_score.txt')) + output_eval_score_file = os.path.join(output_model_dir, output_eval_score_file) + + print(f'Training {parser} parser with {train_file}, {eval_file} --> {output_file}, '+ + f'{output_model_dir}/{output_model_file}') + print(f'Parameters: {extra_args}') + + train_stanza( train_file, eval_file, output_model_dir, output_model_file, \ + output_file, args=extra_args, dry_run=dry_run ) + if predict_after: + predict_eval_with_stanza(eval_file, output_model_dir, output_model_file, output_file, + dry_run=dry_run) + run_conll18_ud_eval(eval_file, output_file, return_type='las_f1', + save_results_file=output_eval_score_file, dry_run=dry_run) + elif experiment_type_clean in ['crossvalidation', 'half_data', 'smaller_data', 'multi_experiment']: + # ------------------------------------------ + # 'multi_experiment' (general) + # 'crossvalidation' + # 'half_data' + # 'smaller_data' + # ------------------------------------------ + # input_dir + if not config.has_option(section, 'input_dir'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "input_dir" parameter.') + input_dir = config[section]['input_dir'] + if not os.path.isdir(input_dir): + raise FileNotFoundError(f'Error in {conf_file}: invalid "input_dir" value {input_dir!r} in {section!r}.') + # output_dir + if not config.has_option(section, 'output_dir'): + raise ValueError(f'Error in 
def bulk_train( data_folder, train_file_pat, dev_file_pat, test_file_pat, output_path,
                subexp=None, parser='stanza', predict_after=True, args='', dry_run=False ):
    '''
    Trains models of multiple sub-experiments on (train/dev) files from `data_folder`.
    Optionally, also evaluates each sub-experiment model (on test files).
    Outputs trained models and evaluation results to `output_path`.

    Parameters `train_file_pat`, `dev_file_pat` and `test_file_pat` must be strings
    compilable into regexp patterns that can be used to detect data sets of all sub-
    experiments.
    Each of these patterns must have the named group 'exp', indicating part of the
    pattern matching sub-experiment name.
    Files are paired by the sub-experiment name: each sub-experiment must have
    exactly one train, one dev and one test file (one file can fill several roles
    if it matches several patterns).

    Use parameter `subexp` to restrict training and evaluation only to a single
    sub-experiment instead of performing all sub-experiments.
    This is useful when multiple instances of the Python are launched for
    parallelization.

    Additional (training) parameters for parser can be provided via `args`.
    Flag `dry_run` is passed on to the training, prediction and evaluation
    functions.

    Raises ValueError on unsupported parser, on uncompilable pattern, on pattern
    missing the group 'exp' and on unknown `subexp`; TypeError if a pattern is not
    a string; Exception if `data_folder` is not a directory or the files found do
    not form complete sub-experiments.
    '''
    # Validate input arguments
    supported_parsers = ['stanza']
    if not isinstance(parser, str) or parser.lower() not in supported_parsers:
        raise ValueError( f'(!) Unexpected parser: {parser!r}. '+\
                          f'Supported parsers: {supported_parsers!r}' )
    parser = parser.lower()
    if not os.path.isdir(data_folder):
        raise Exception(f'(!) Missing or invalid input directory {data_folder!r}')
    file_patterns = [ ['train', train_file_pat],
                      ['dev',   dev_file_pat],
                      ['test',  test_file_pat] ]
    # Convert file patterns to regular expressions
    regexp_file_patterns = []
    for subset, file_pat in file_patterns:
        if not isinstance(file_pat, str):
            raise TypeError(f'{subset}_file_pat must be a string')
        try:
            regexp = re.compile(file_pat)
        except re.error as err:
            raise ValueError(f'Unable to convert {file_pat!r} to regexp') from err
        if 'exp' not in regexp.groupindex:
            raise ValueError(f'Regexp {file_pat!r} is missing named group "exp"')
        regexp_file_patterns.append( [subset, regexp] )
    # Collect experiment input files. Files are stored by sub-experiment name
    # (not by position in a list), so that train/dev/test files of different
    # sub-experiments can never be mixed up.
    experiment_files = { subset: {} for subset, _ in file_patterns }
    experiment_names = []   # in order of first appearance
    for fname in sorted( os.listdir(data_folder) ):
        for subset, regexp in regexp_file_patterns:
            m = regexp.match(fname)
            if not m:
                continue
            if not (fname.lower()).endswith('.conllu'):
                raise Exception( f'(!) invalid file {fname}: {subset} file '+\
                                  'must have extension .conllu' )
            exp_name = m.group('exp')
            fpath = os.path.join(data_folder, fname)
            if exp_name in experiment_files[subset]:
                previous = experiment_files[subset][exp_name]
                raise Exception( f'Multiple {subset} files match sub-experiment '+\
                                 f'{exp_name!r}: {previous!r} and {fpath!r}.' )
            experiment_files[subset][exp_name] = fpath
            if exp_name not in experiment_names:
                experiment_names.append(exp_name)
    # Validate that we have all required files
    for subset, file_pat in file_patterns:
        found = experiment_files[subset]
        if len(found) == 0:
            raise Exception(f'Unable to find any {subset} files '+\
                            f'matching {file_pat!r} in dir {data_folder!r}.')
        missing = [name for name in experiment_names if name not in found]
        if missing:
            raise Exception(f'Number of {subset} files ({len(found)}) does not match '+\
                            f'the number of experiments ({len(experiment_names)}). '+\
                            f'Experiments without a {subset} file: {missing!r}.')
    if subexp is not None:
        if subexp not in experiment_names:
            raise ValueError( f'(!) sub-experiment {subexp!r} not in collected '+\
                              f'experiment names: {experiment_names}.' )
    # Launch experiments
    start_time = datetime.now()
    for exp_no in experiment_names:
        if subexp is not None and exp_no != subexp:
            # Skip other experiments
            continue
        train_file = experiment_files['train'][exp_no]
        dev_file   = experiment_files['dev'][exp_no]
        test_file  = experiment_files['test'][exp_no]
        output_model_dir = output_path
        output_model_file = f"model_{exp_no}.pt"
        test_file_name = os.path.basename(test_file)
        output_file = os.path.join( output_model_dir, 'train_output_'+test_file_name )
        output_eval_score_file = 'eval_'+(test_file_name.replace('.conllu', '_score.txt'))
        output_eval_score_file = os.path.join(output_model_dir, output_eval_score_file)

        print('='*(len(exp_no)*2))
        print(f' {exp_no}')
        print('='*(len(exp_no)*2))
        print(f'Training {parser} parser with {train_file}, {dev_file}, {test_file} --> '+
              f'{output_file}, {output_model_dir}/{output_model_file}')
        print(f'Parameters: {args}')

        if parser == 'stanza':
            train_stanza( train_file, dev_file, output_model_dir, output_model_file,
                          output_file, args=args, dry_run=dry_run )
            if predict_after:
                # Predictions on the test file overwrite output_file,
                # which is then scored against the gold test file
                predict_eval_with_stanza(test_file, output_model_dir, output_model_file,
                                         output_file, dry_run=dry_run)
                run_conll18_ud_eval(test_file, output_file, return_type='las_f1',
                                    save_results_file=output_eval_score_file,
                                    dry_run=dry_run)
        print()
        print()
    print(f'Total time elapsed: {datetime.now()-start_time}')
def train_stanza(train_file, eval_file, output_model_dir, output_model_file, output_file,
                 lang='et', treebank='et_edt', args='', dry_run=False):
    '''
    Trains single stanza model on `train_file` using `eval_file` for parameter tuning
    and model evaluation.

    Note: in previous experiments, in addition to `eval_file`, a separate parameter
    `gold_file` was defined. Here, we assume that `gold_file` == `eval_file`, so only
    parameter `eval_file` is required.

    Creates `output_model_dir` if it does not exist (also in case of a dry run).
    If `dry_run` is set, then returns without launching the training.
    Extra command line arguments for the parser can be given in string `args`
    (separated by whitespace).

    Uses parameters of stanza parser:
        --save_dir     : Root dir for saving models (output_model_dir)
        --save_name    : File name to save the model (output_model_file)
        --train_file   : Input file for data loader.
        --eval_file    : Input file for data loader.
        --no_pretrain  : Turn off pretrained embeddings.
        --output_file  : Output CoNLL-U file.
        --gold_file    : Gold CoNLL-U file (gold labels for eval_file)
        --lang         : Language
        --shorthand    : Treebank shorthand
        --mode         : choices=['train', 'predict']
        --batch_size   : default=5000
    '''
    if not os.path.exists(output_model_dir):
        os.makedirs(output_model_dir, exist_ok=True)
    # Build the argument list directly: composing a single string and splitting
    # it on whitespace would break every path that contains a space
    stanza_args = [
        '--save_dir',    output_model_dir,
        '--save_name',   output_model_file,
        '--train_file',  train_file,
        '--eval_file',   eval_file,
        '--no_pretrain',
        '--output_file', output_file,
        '--gold_file',   eval_file,
        '--lang',        lang,
        '--shorthand',   treebank,
        '--mode',        'train',
    ]
    stanza_args = [str(arg) for arg in stanza_args]
    # Only user-provided extra arguments are split on whitespace
    stanza_args.extend( (args or '').split() )
    if dry_run:
        return
    stanza_main( args=stanza_args )
def predict_eval_with_stanza(eval_file, output_model_dir, output_model_file, output_file,
                             lang='et', treebank='et_edt', args='', dry_run=False):
    '''
    Uses existing stanza's model `output_model_file` to predict labels for `eval_file`.

    Note: in previous experiments, in addition to `eval_file`, a separate parameter
    `gold_file` was defined. Here, we assume that `gold_file` == `eval_file`, so only
    parameter `eval_file` is required.

    Raises ValueError if the model directory `output_model_dir` does not exist.
    If `dry_run` is set, then returns without launching the prediction.
    Extra command line arguments for the parser can be given in string `args`
    (separated by whitespace).

    Uses parameters of stanza parser:
        --save_dir     : Root dir for saving models (output_model_dir)
        --save_name    : File name to save the model (output_model_file)
        --eval_file    : Input file for data loader.
        --no_pretrain  : Turn off pretrained embeddings.
        --output_file  : Output CoNLL-U file.
        --gold_file    : Gold CoNLL-U file (gold labels for eval_file)
        --lang         : Language
        --shorthand    : Treebank shorthand
        --mode         : choices=['train', 'predict']
    '''
    if not os.path.exists(output_model_dir):
        raise ValueError(f'(!) Non-existent model path: {output_model_dir}/{output_model_file}')
    # Build the argument list directly: composing a single string and splitting
    # it on whitespace would break every path that contains a space
    stanza_args = [
        '--save_dir',    output_model_dir,
        '--save_name',   output_model_file,
        '--no_pretrain',
        '--eval_file',   eval_file,
        '--output_file', output_file,
        '--gold_file',   eval_file,
        '--lang',        lang,
        '--shorthand',   treebank,
        '--mode',        'predict',
    ]
    stanza_args = [str(arg) for arg in stanza_args]
    # Only user-provided extra arguments are split on whitespace
    stanza_args.extend( (args or '').split() )
    if dry_run:
        return
    stanza_main( args=stanza_args )
def run_conll18_ud_eval(gold_file, system_file, return_type='las_f1', save_results_file=None, dry_run=False):
    '''
    Calculates CONLL-2018 evaluation scores based on given `gold_file` and `system_file`.
    If return_type == 'las_f1' (default), then returns LAS score (as string).
    If return_type == 'table', then returns CONLL-2018 evaluation table (as string).
    Optionally, if `save_results_file` is provided, saves returned value into given file.

    If `dry_run` is set, then returns None without evaluating or saving anything.
    Raises ValueError on unexpected `return_type` and (unless it is a dry run)
    if `save_results_file` is not None nor a non-empty string.
    '''
    if not isinstance(return_type, str) or \
       return_type.lower() not in ['las_f1', 'table']:
        raise ValueError(f'(!) Unexpected return type {return_type!r}')
    return_type = return_type.lower()
    if dry_run:
        return None
    # Check the output location before the evaluation, so that
    # a bad argument does not waste the whole evaluation run
    if save_results_file is not None:
        if not isinstance(save_results_file, str) or len(save_results_file) == 0:
            raise ValueError( '(!) save_results_file must be a non-empty string, '+\
                             f'not {save_results_file!r}' )
    # Evaluate
    # The following code is based on:
    #  https://github.com/stanfordnlp/stanza/blob/main/stanza/utils/conll18_ud_eval.py#L658-L673
    treebank_type = {}
    treebank_type['no_gapping'] = 0
    treebank_type['no_shared_parents_in_coordination'] = 0
    treebank_type['no_shared_dependents_in_coordination'] = 0
    treebank_type['no_control'] = 0
    treebank_type['no_external_arguments_of_relative_clauses'] = 0
    treebank_type['no_case_info'] = 0
    treebank_type['no_empty_nodes'] = False
    treebank_type['multiple_roots_okay'] = False
    # Load CoNLL-U files
    gold_ud = stanza_load_conllu_file(gold_file, treebank_type)
    system_ud = stanza_load_conllu_file(system_file, treebank_type)
    eval_result = evaluate(gold_ud, system_ud)
    # Format results
    if return_type == 'las_f1':
        # result is LAS f1 score
        result = f'{(100*eval_result["LAS"].f1):.2f}'
    else:
        # result is a table of scores
        result = build_evaluation_table(eval_result, True, False, True)
    if save_results_file is not None:
        # Save results if needed
        with open(save_results_file, 'w', encoding='utf-8') as out_f:
            out_f.write(str(result))
    return result


# ========================================================================

if __name__ == '__main__':
    if len(sys.argv) < 2:
        raise Exception('(!) Missing input argument: name of the configuration INI file.')
    conf_file = sys.argv[1]
    # Optional second argument: name of the single sub-experiment to be run
    subexp = None
    if len(sys.argv) > 2:
        subexp = sys.argv[2]
    train_models_main( conf_file, subexp=subexp )
def run_maltoptimizer_main( conf_file, verbose=True ):
    '''
    Runs MaltOptimizer, which provides feature selection before MaltParser training.
    Settings/parameters will be read from the given `conf_file`.
    Executes sections in the configuration starting with 'maltoptimize_'.

    Section parameters:
      input_files    -- one file or a list of files (with full paths), separated
                        by ',' or ';' (required);
      output_dir     -- directory for the resulting finalOptionsFile / featureFile
                        (required);
      conll_file_pat -- regexp with the named group 'exp', which extracts the
                        sub-experiment name from the input file name (optional).
                        The extracted name becomes suffix of the output files.

    Raises Exception if `conf_file` does not exist, and ValueError if the
    configuration cannot be parsed, a required parameter is missing or
    `conll_file_pat` is invalid / does not match an input file.
    Parameter `verbose` is currently not used.
    (this function must remain runnable with Python 2.7)
    '''
    # Parse configuration file
    config = ConfigParser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise Exception("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))
    start = datetime.now()
    section_found = False
    for section in config.sections():
        if not section.startswith('maltoptimize_'):
            continue
        section_found = True
        # input_files -- one file or a list of files (with full paths) separated by , or ;
        if not config.has_option(section, 'input_files'):
            raise ValueError('Error in %s: section %s is missing "input_files" parameter.' % (conf_file, section) )
        raw_input_files = config.get(section, 'input_files')
        # Comma has priority as a separator; otherwise split by semicolon
        separator = ',' if ',' in raw_input_files else ';'
        input_files = [fname.strip() for fname in raw_input_files.split(separator)]
        # Drop empty names (e.g. resulting from a trailing separator)
        input_files = [fname for fname in input_files if len(fname) > 0]
        if len(input_files) == 0:
            raise ValueError('Error in %s: section %s has an empty "input_files" parameter.' % (conf_file, section) )
        # output_dir -- dir where to put finalOptionsFile.xml & [featureFile.xml]
        if not config.has_option(section, 'output_dir'):
            raise ValueError('Error in %s: section %s is missing "output_dir" parameter.' % (conf_file, section) )
        output_dir = config.get(section, 'output_dir')
        conll_file_pat = None
        conll_file_regexp = None
        # Customize sub-experiment pattern (if required)
        if config.has_option(section, 'conll_file_pat'):
            conll_file_pat = config.get(section, 'conll_file_pat')
            # Convert file pattern to regular expression
            try:
                conll_file_regexp = re.compile(conll_file_pat)
            except re.error:
                raise ValueError('Unable to convert {!r} to regexp'.format(conll_file_pat))
            if 'exp' not in conll_file_regexp.groupindex:
                raise ValueError('Regexp {!r} is missing named group "exp"'.format(conll_file_pat))
        # Run optimizer on all input files
        for input_file in input_files:
            sub_exp = ''
            if conll_file_regexp is not None:
                # Input files are given with paths, while the pattern usually
                # describes a file name: try the whole path first and then
                # fall back to the bare file name
                m = conll_file_regexp.match( input_file )
                if not m:
                    m = conll_file_regexp.match( os.path.basename(input_file) )
                if not m:
                    raise ValueError('Input file {!r} does not match pattern {!r}'.format(input_file, conll_file_pat))
                sub_exp = m.group('exp')
            optimize_maltparser(input_file, output_dir=output_dir, sub_exp=sub_exp)
    if section_found:
        print('Total processing time: %s' % (datetime.now()-start))
    else:
        print('No section starting with "maltoptimize_" in %s.' % (conf_file))
def optimize_maltparser(input_conll_file, output_dir=None, sub_exp=''):
    '''
    Runs MaltOptimizer.jar on given input_conll_file (dev dataset).

    Executes all three phases of the optimizer inside the directory
    `maltoptimizer_dir` (module level setting). After that, if `output_dir`
    is given, moves resulting 'finalOptionsFile.xml' and the feature model
    file named on the last line of 'phase3_optFile.txt' (if such file exists)
    into `output_dir` as 'finalOptionsFile{sub_exp}.xml' and
    'featureFile{sub_exp}.xml'.

    Raises RuntimeError if a phase of the optimizer exits with an error,
    and ValueError if 'phase3_optFile.txt' is empty.
    (this function must remain runnable with Python 2.7)

    See also:
      https://github.com/estnltk/maltparser_training
      https://github.com/estnltk/syntax_experiments/blob/devel/03_create_training_testing_data/MaltOptimizer-1.0.3/optimize_maltparser.py
    '''
    import shutil

    # Make input file path absolute
    # (otherwise maltparser fails to load the file)
    input_conll_file = os.path.abspath(input_conll_file)

    for phase in (1, 2, 3):
        # Arguments are passed as a list (without shell), so that paths
        # containing spaces or shell metacharacters are handled correctly
        command = ['java', '-jar', maltoptimizer_jar, '-p', str(phase),
                   '-m', maltparser_jar, '-c', input_conll_file]
        exit_code = subprocess.call(command, cwd=maltoptimizer_dir)
        if exit_code != 0:
            # Do not continue with (possibly stale) results of a failed phase
            raise RuntimeError('(!) MaltOptimizer phase %d failed with exit code %d' % (phase, exit_code))

    # Name of the feature model file is the text after the last ':'
    # on the last non-empty line of phase3_optFile.txt
    phase3_opt_file = os.path.join(maltoptimizer_dir, 'phase3_optFile.txt')
    with open(phase3_opt_file, 'r') as f:
        lines = [line for line in f.read().split('\n') if len(line) > 0]
    if len(lines) == 0:
        raise ValueError('(!) Unexpectedly empty file: %s' % phase3_opt_file)
    feature_model = lines[-1].split(':')[-1].strip()

    if output_dir is not None:
        # move the files to correct place:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # shutil.move also works if output_dir is on another file system
        shutil.move(os.path.join(maltoptimizer_dir, 'finalOptionsFile.xml'),
                    os.path.join(output_dir, 'finalOptionsFile%s.xml' % sub_exp))
        if os.path.isfile(os.path.join(maltoptimizer_dir, feature_model)):
            shutil.move(os.path.join(maltoptimizer_dir, feature_model),
                        os.path.join(output_dir, 'featureFile%s.xml' % sub_exp))


if __name__ == '__main__':
    # First, check that required folders and jar files are present
    if not os.path.isdir(maltoptimizer_dir):
        raise Exception( 'Missing directory: %s. Please get MaltOptimizer from: http://nil.fdi.ucm.es/maltoptimizer/' % (maltoptimizer_dir) )
    malt_dir_files = os.listdir(maltoptimizer_dir)
    if maltparser_jar not in malt_dir_files:
        jar_path = os.path.join(maltoptimizer_dir, maltparser_jar)
        raise Exception( 'Missing jar file: %s. Please get MaltParser from: https://maltparser.org/index.html' % (jar_path) )
    if maltoptimizer_jar not in malt_dir_files:
        jar_path = os.path.join(maltoptimizer_dir, maltoptimizer_jar)
        raise Exception( 'Missing jar file: %s. Please get MaltOptimizer from: http://nil.fdi.ucm.es/maltoptimizer/' % (jar_path) )
    # Get parameters from command line
    if len(sys.argv) < 2:
        raise Exception('(!) Missing input argument: name of the configuration INI file.')
    conf_file = sys.argv[1]
    run_maltoptimizer_main( conf_file, verbose=True )
# Change to local paths & files, if required
DEFAULT_MALTPARSER_DIR = 'MaltOptimizer-1.0.3'
DEFAULT_MALTPARSER_JAR = 'maltparser-1.9.2.jar'
#DEFAULT_UDPIPE_DIR = 'udpipe-1.2.0-bin\\bin-win64'
DEFAULT_UDPIPE_DIR = 'udpipe-1.2.0-bin/bin-linux64'

def train_malt_udpipe_main( conf_file, subexp=None, dry_run=False ):
    '''
    Trains MaltParser/UDPipe-1 models based on the configuration.
    Settings/parameters of the training will be read from the given
    `conf_file`.
    Executes sections in the configuration starting with prefix
    'train_malt_' and 'train_udpipe1_'.

    Optionally, if `subexp` is defined, then trains only that
    sub-experiment and skips all other sub-experiments (in
    crossvalidation, smaller_data and half_data experiments).

    Flag `dry_run` is the default for all sections; a section can
    override it with its own 'dry_run' parameter. In a dry run, the
    configuration and input files are checked, but no model is trained.

    Raises FileNotFoundError if `conf_file` or a configured input
    file/directory does not exist, and ValueError if the configuration
    cannot be parsed, a required parameter is missing or a parameter
    has an invalid value.
    '''
    # Parse configuration file
    config = configparser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise FileNotFoundError("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))
    known_experiment_types = ['full_data', 'crossvalidation', 'half_data',
                              'smaller_data', 'multi_experiment']
    section_found = False
    for section in config.sections():
        if not section.startswith(('train_malt_', 'train_udpipe1_')):
            continue
        parser = 'maltparser' if section.startswith('train_malt_') else 'udpipe1'
        section_found = True
        subexp_str = '' if subexp is None else f' ({subexp})'
        print(f'Running {section}{subexp_str} ...')
        section_conf = config[section]
        experiment_type = section_conf.get('experiment_type', 'full_data')
        experiment_type_clean = (experiment_type.strip()).lower()
        if experiment_type_clean not in known_experiment_types:
            raise ValueError('(!) Unexpected experiment_type value: {!r}'.format(experiment_type))
        # Note: dry_run of a section is resolved inside the helper, so
        # that the setting of one section does not leak into the next
        if experiment_type_clean == 'full_data':
            _train_full_data_section(conf_file, section, section_conf, parser, dry_run)
        else:
            # 'multi_experiment' (general), 'crossvalidation', 'half_data', 'smaller_data'
            _train_multi_experiment_section(conf_file, section, section_conf, parser,
                                            subexp, dry_run)
    if not section_found:
        print(f'No section starting with "train_malt_" or "train_udpipe1_" in {conf_file}.')

def _get_required_option(section_conf, conf_file, section, option):
    # Returns value of a mandatory parameter of the section (or raises ValueError)
    if option not in section_conf:
        raise ValueError(f'Error in {conf_file}: section {section!r} is missing "{option}" parameter.')
    return section_conf[option]

def _read_parser_options(section_conf):
    # Reads parser-specific settings (MaltParser & UDPipe-1) shared by all experiment types
    return {
        # MaltParser options
        'maltparser_dir': section_conf.get('maltparser_dir', DEFAULT_MALTPARSER_DIR),
        'maltparser_jar': section_conf.get('maltparser_jar', DEFAULT_MALTPARSER_JAR),
        # UDPipe-1 options
        'create_embeddings_file': section_conf.get('create_embeddings_file', None),
        'parser_options': section_conf.get('parser_options', None),
        'udpipe_dir': section_conf.get('udpipe_dir', DEFAULT_UDPIPE_DIR),
    }

def _train_model(parser, output_model_file, train_file, output_model_dir, options,
                 final_options_file=None, feature_model_file=None,
                 create_embeddings_file=None):
    # Launches training function of the given parser
    if parser == 'maltparser':
        train_maltparser(output_model_file, train_file, output_dir=output_model_dir,
                         final_options_file=final_options_file,
                         feature_model_file=feature_model_file,
                         maltparser_dir=options['maltparser_dir'],
                         maltparser_jar=options['maltparser_jar'])
    elif parser == 'udpipe1':
        train_udpipe1(output_model_file, train_file, output_model_dir,
                      parser_options=options['parser_options'],
                      create_embeddings_file=create_embeddings_file,
                      udpipe_dir=options['udpipe_dir'])
    else:
        raise Exception(f'Unexpected parser name: {parser!r}')

def _train_full_data_section(conf_file, section, section_conf, parser, dry_run):
    # Trains a single model on the 'train_file' of a 'full_data' section
    # train_file with path
    train_file = _get_required_option(section_conf, conf_file, section, 'train_file')
    if not os.path.isfile(train_file):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "train_file" value {train_file!r} in {section!r}.')
    # output_dir
    output_model_dir = _get_required_option(section_conf, conf_file, section, 'output_dir')
    # other parameters
    dry_run = section_conf.getboolean('dry_run', fallback=dry_run)
    default_model = 'model.mco' if parser == 'maltparser' else 'model.udpipe'
    output_model_file = section_conf.get('model_file', default_model)
    options = _read_parser_options(section_conf)
    if dry_run:
        return
    _train_model(parser, output_model_file, train_file, output_model_dir, options,
                 final_options_file=section_conf.get('final_options_file', None),
                 feature_model_file=section_conf.get('feature_model_file', None),
                 create_embeddings_file=options['create_embeddings_file'])

def _find_feature_files(feature_files_dir, all_feature_files, final_options_file_re,
                        feature_model_file_re, cur_subexp):
    # Finds MaltOptimizer's output files of the given sub-experiment.
    # Name of the experiment can also be without leading zeros in the file name.
    accepted_names = (cur_subexp, cur_subexp.lstrip('0'))
    final_options_file = None
    feature_model_file = None
    for feats_file in all_feature_files:
        f1 = final_options_file_re.match(feats_file)
        if f1 and f1.group('exp') in accepted_names:
            final_options_file = os.path.join(feature_files_dir, feats_file)
        f2 = feature_model_file_re.match(feats_file)
        if f2 and f2.group('exp') in accepted_names:
            feature_model_file = os.path.join(feature_files_dir, feats_file)
    if final_options_file is None:
        raise Exception(f'Unable to find final_options_file for experiment {cur_subexp!r}')
    if feature_model_file is None:
        raise Exception(f'Unable to find feature_model_file for experiment {cur_subexp!r}')
    return final_options_file, feature_model_file

def _train_multi_experiment_section(conf_file, section, section_conf, parser, subexp, dry_run):
    # Trains one model for each training file (sub-experiment) found from 'input_dir'
    # input_dir
    input_dir = _get_required_option(section_conf, conf_file, section, 'input_dir')
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "input_dir" value {input_dir!r} in {section!r}.')
    # output_dir
    output_model_dir = _get_required_option(section_conf, conf_file, section, 'output_dir')
    # Common options
    dry_run = section_conf.getboolean('dry_run', fallback=dry_run)
    # Patterns for capturing sub-experiment names (can be overridden in the section).
    # Each pattern must have the named group 'exp'.
    train_file_re = _create_regexp_pattern(
        section_conf.get('train_file_pat', r'(?P<exp>\d+)_train.conllu'),
        'train_file_pat')
    final_options_file_re = _create_regexp_pattern(
        section_conf.get('final_options_file_pat', r'finalOptionsFile(?P<exp>\S+)\.xml'),
        'final_options_file_pat')
    feature_model_file_re = _create_regexp_pattern(
        section_conf.get('feature_model_file_pat', r'featureFile(?P<exp>\S+)\.xml'),
        'feature_model_file_pat')
    options = _read_parser_options(section_conf)
    # MaltParser's feature selection files (not used in UDPipe-1 training)
    feature_files_dir = section_conf.get('feature_files_dir', None)
    all_feature_files = []
    if parser == 'maltparser' and feature_files_dir is not None and \
       os.path.isdir(feature_files_dir):
        all_feature_files = sorted(os.listdir(feature_files_dir))
    # Iterate over input files (in a fixed order) and train
    for in_fname in sorted(os.listdir(input_dir)):
        if not in_fname.endswith('.conllu'):
            continue
        m = train_file_re.match(in_fname)
        if not m:
            continue
        # Candidate for a training file
        cur_subexp = m.group('exp')
        if subexp is not None and cur_subexp != subexp:
            continue
        train_file = os.path.join(input_dir, in_fname)
        if parser == 'maltparser':
            output_model_file = f'model_{cur_subexp}.mco'
        else:
            output_model_file = f'model_{cur_subexp}.udpipe'
        # Try to find feature selection files (if any provided)
        final_options_file = None
        feature_model_file = None
        if len(all_feature_files) > 0:
            final_options_file, feature_model_file = \
                _find_feature_files(feature_files_dir, all_feature_files,
                                    final_options_file_re, feature_model_file_re,
                                    cur_subexp)
        if dry_run:
            continue
        # Launch training
        cur_embeddings_file = options['create_embeddings_file']
        if parser == 'maltparser':
            print(f' Training Maltparser on {train_file} (exp: {cur_subexp}) ...')
        elif cur_embeddings_file is not None:
            # Each sub-experiment gets its own embeddings file
            cur_embeddings_file = f'{cur_subexp}_{cur_embeddings_file}'
        _train_model(parser, output_model_file, train_file, output_model_dir, options,
                     final_options_file=final_options_file,
                     feature_model_file=feature_model_file,
                     create_embeddings_file=cur_embeddings_file)

def _create_regexp_pattern(fpattern, pattern_var_name):
    '''
    Converts file pattern to regular expression.
    Raises TypeError if `fpattern` is not a string, and ValueError if it
    cannot be compiled or is missing the named group 'exp'.
    '''
    if not isinstance(fpattern, str):
        raise TypeError(f'{pattern_var_name} must be a string')
    try:
        regexp = re.compile(fpattern)
    except re.error as err:
        raise ValueError(f'Unable to convert {fpattern!r} to regexp') from err
    if 'exp' not in regexp.groupindex:
        raise ValueError(f'Regexp {fpattern!r} is missing named group "exp"')
    return regexp
def check_maltparser_requirements(maltparser_dir=DEFAULT_MALTPARSER_DIR,
                                  maltparser_jar=DEFAULT_MALTPARSER_JAR):
    '''
    Checks that MaltParser's required folders and jar files are present:
    the directory `maltparser_dir`, the file `maltparser_jar` and the
    sub directory 'lib' inside it.
    Raises an exception if anything is missing, otherwise returns True.
    '''
    download_hint = 'Please get MaltParser from: https://maltparser.org/index.html'
    if not os.path.isdir(maltparser_dir):
        raise Exception( f'Missing directory: {maltparser_dir}. {download_hint}' )
    jar_path = os.path.join(maltparser_dir, maltparser_jar)
    if not os.path.isfile(jar_path):
        raise Exception( f'Missing jar file: {jar_path}. {download_hint}' )
    lib_path = os.path.join(maltparser_dir, 'lib')
    if not os.path.isdir(lib_path):
        raise Exception( f'Missing java libraries dir: {lib_path}. {download_hint}' )
    return True

def train_maltparser(output_model, train_corpus, output_dir=None, final_options_file=None, feature_model_file=None,
                     maltparser_dir=DEFAULT_MALTPARSER_DIR, maltparser_jar=DEFAULT_MALTPARSER_JAR):
    '''
    Trains MaltParser on train_corpus, creates output_model and saves into output_dir.
    Optionally, uses final_options_file and feature_model_file for feature selection
    (both files must be given, otherwise feature selection is not used).

    The training is executed inside `maltparser_dir`. If `output_dir` is None,
    then the model will be left there; otherwise it is moved to `output_dir`
    (replacing a former model with the same name).

    Raises Exception if MaltParser's files are missing, and RuntimeError if
    the training process exits with an error.
    '''
    import shutil
    check_maltparser_requirements(maltparser_dir=maltparser_dir, maltparser_jar=maltparser_jar)
    # Make input file paths absolute
    # (because the command is executed inside maltparser_dir)
    train_corpus = os.path.abspath(train_corpus)
    if final_options_file is not None:
        final_options_file = os.path.abspath(final_options_file)
    if feature_model_file is not None:
        feature_model_file = os.path.abspath(feature_model_file)
    # Construct command
    # Arguments are passed as a list (without shell), so that paths
    # containing spaces or shell metacharacters are handled correctly
    train_command = ['java', '-Xmx6g', '-jar', maltparser_jar, '-i', train_corpus,
                     '-c', output_model, '-m', 'learn']
    if final_options_file is not None and feature_model_file is not None:
        train_command.extend( ['-f', final_options_file, '-F', feature_model_file] )
    # Execute training
    exit_code = subprocess.call(train_command, cwd=maltparser_dir)
    if exit_code != 0:
        raise RuntimeError(f'(!) MaltParser training failed with exit code {exit_code}')
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        output_model_path = os.path.join(output_dir, output_model)
        # Remove old file
        if os.path.exists(output_model_path):
            os.remove(output_model_path)
        # Move model file to output dir
        # (shutil.move also works if output_dir is on another file system)
        shutil.move(os.path.join(maltparser_dir, output_model), output_model_path)
# ===============================================================
#   Train UDPipe-1 (preprocessing)
# ===============================================================

def is_gensim_available():
    '''
    Checks if the package gensim is available.
    This is required for creating word2vec embeddings with gensim.
    Returns a boolean. The package itself is not imported.
    '''
    # pkgutil.find_loader() is deprecated since Python 3.12 and scheduled
    # for removal in 3.14; importlib.util.find_spec() is its replacement
    import importlib.util
    return importlib.util.find_spec('gensim') is not None

def load_conllu_tokens_sentences(input_conllu):
    '''
    Loads conllu file's textual content.
    Returns a list of lists: sentences of tokens (values of the field 'form').
    '''
    with open(input_conllu, 'r', encoding='utf-8') as conllu_file:
        return [ [token['form'] for token in sentence]
                 for sentence in parse_incr(conllu_file) ]
def _merge_udpipe_parser_options(parser_options, embeddings_path=None):
    '''
    Returns UDPipe-1 parser options string in which 'use_gold_tags=1' is present and,
    if embeddings_path is not None, 'embedding_form_file' points to embeddings_path.
    '''
    if parser_options is None:
        parser_options = 'use_gold_tags=1'
    elif 'use_gold_tags=1' not in parser_options:
        parser_options += ';use_gold_tags=1'
    if embeddings_path is not None:
        if 'embedding_form_file=' not in parser_options:
            parser_options += f';embedding_form_file={embeddings_path}'
        else:
            # Overwrite the existing value. The option may be the very first
            # one (no leading ';'), so anchor on start-of-string as well.
            # A function is used as the replacement, so that backslashes in
            # the path are not interpreted as regexp escapes.
            parser_options = re.sub(
                r'(^|;)embedding_form_file=[^; ]*',
                lambda m: f'{m.group(1)}embedding_form_file={embeddings_path}',
                parser_options)
    return parser_options


def train_udpipe1(output_model, train_corpus, output_dir, parser_options=None, create_embeddings_file=None, verbose=True,
                  udpipe_dir=DEFAULT_UDPIPE_DIR):
    '''
    Trains UDPipe-1 on train_corpus, creates output_model and saves into output_dir.
    List of parser options can be provided via string parser_options (options are
    separated by ';'). Option 'use_gold_tags=1' is always added to the options.
    If create_embeddings_file is not None, then creates word2vec form embeddings from train_corpus,
    saves into file named create_embeddings_file (inside output_dir), and passes the file
    to UDPipe via option 'embedding_form_file'.

    The udpipe executable is taken from udpipe_dir if that directory exists, otherwise
    it is expected to be available in system PATH. Raises Exception if neither holds.
    If verbose is set, prints progress messages and a warning if udpipe exits with
    a non-zero code.

    Note: if parser_options is not provided, then UDPipe's default training options are:
      Parser transition options: system=projective, oracle=dynamic, structured_interval=8, single_root=1
      Parser uses lemmas/upos/xpos/feats: from gold data
      Parser embeddings options: upostag=20, feats=20, xpostag=0, form=50, lemma=0, deprel=20
        form mincount=2, precomputed form embeddings=none
        lemma mincount=2, precomputed lemma embeddings=none
      Parser network options: iterations=10, hidden_layer=200, batch_size=10,
        learning_rate=0.0200, learning_rate_final=0.0010, l2=0.5000, early_stopping=0
    '''
    udpipe_dir_exists = udpipe_dir is not None and os.path.isdir(udpipe_dir)
    udpipe_is_in_path = check_if_udpipe_is_in_path()
    if not udpipe_dir_exists and not udpipe_is_in_path:
        raise Exception('(!) Could not find UDPipe executable. '+\
                        'Please make sure udpipe is installed and available in system PATH. '+\
                        'Or, alternatively, provide location of UDPipe via variable udpipe_dir. '+\
                        'You can download udpipe from: https://ufal.mff.cuni.cz/udpipe/1/')
    os.makedirs(output_dir, exist_ok=True)
    embeddings_path = None
    if create_embeddings_file is not None:
        embeddings_path = os.path.join(output_dir, create_embeddings_file)
        if verbose:
            print(f'Creating word2vec embeddings file {create_embeddings_file} ...')
        create_word2vec_model(train_corpus, embeddings_path)
    output_model_path = os.path.join(output_dir, output_model)
    parser_options = _merge_udpipe_parser_options(parser_options, embeddings_path)
    if verbose:
        print(f' Training UDPipe-1 on {train_corpus!r} with settings {parser_options!r} ...')
    udpipe_cmd = 'udpipe'
    if udpipe_dir_exists:
        udpipe_cmd = os.path.join(udpipe_dir, udpipe_cmd)
    # The command is passed as an argument list (no shell): this way the
    # ';'-separated parser options and paths containing spaces or quotes
    # reach udpipe intact, without any platform-specific quoting.
    train_command = [ udpipe_cmd, '--train', output_model_path,
                      '--tokenizer=none', '--tagger=none',
                      f'--parser={parser_options}', train_corpus ]
    # Execute training
    return_code = subprocess.call(train_command)
    if return_code != 0 and verbose:
        print(f'(!) UDPipe-1 training exited with non-zero code {return_code}.')
def _get_required_option(config, conf_file, section, option):
    '''Returns value of `option` from `section`; raises ValueError if the option is missing.'''
    if not config.has_option(section, option):
        raise ValueError(f'Error in {conf_file}: section {section!r} is missing "{option}" parameter.')
    return config[section][option]


def _get_required_path(config, conf_file, section, option, path_check):
    '''Returns value of required `option`; raises FileNotFoundError if `path_check(value)` fails.'''
    value = _get_required_option(config, conf_file, section, option)
    if not path_check(value):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "{option}" value {value!r} in {section!r}.')
    return value


def _run_full_data_predictions(config, conf_file, section, dry_run):
    '''Validates a 'full_data' section and (unless dry run) predicts on its train and test files.'''
    options = config[section]
    train_file = _get_required_path(config, conf_file, section, 'train_file', os.path.isfile)
    test_file = _get_required_path(config, conf_file, section, 'test_file', os.path.isfile)
    output_dir = _get_required_option(config, conf_file, section, 'output_dir')
    # 1) use_estnltk=True and use_ensemble=False -- run model with estnltk's preprocessing and StanzaSyntaxTagger;
    # 2) use_estnltk=True and use_ensemble=True -- run model with estnltk's preprocessing and StanzaSyntaxEnsembleTagger
    #    and use_majority_voting=False (with aggregation_algorithm="las_coherence");
    # 3) use_estnltk=True and use_ensemble=True -- run model with estnltk's preprocessing and StanzaSyntaxEnsembleTagger
    #    and use_majority_voting=True (with aggregation_algorithm="majority_voting");
    # 4) use_estnltk=False -- run model on input feats loaded from conllu file;
    use_estnltk = options.getboolean('use_estnltk', False)
    use_ensemble = options.getboolean('use_ensemble', False)
    use_majority_voting = options.getboolean('use_majority_voting', False)
    if use_ensemble and not use_estnltk:
        raise ValueError(f'Error in {conf_file}: section {section!r} conflicting '+\
                          'configuration use_estnltk=False and use_ensemble=True. '+\
                          'Cannot use ensemble tagger without estnltk.' )
    if use_majority_voting and not use_ensemble:
        raise ValueError(f'Error in {conf_file}: section {section!r} conflicting '+\
                          'configuration use_ensemble=False and use_majority_voting=True. '+\
                          'Cannot use majority_voting without ensemble.' )
    default_tagger_path = 'estnltk_neural.taggers.StanzaSyntaxEnsembleTagger' if use_ensemble else \
                          'estnltk_neural.taggers.StanzaSyntaxTagger'
    tagger_path = options.get('tagger_path', default_tagger_path)
    # Section's own dry_run setting overrides the global one, but only for this section
    dry_run = options.getboolean('dry_run', dry_run)
    use_gpu = options.getboolean('use_gpu', False)
    # seed for randomly picking one analysis from ambiguous morph analyses
    seed = options.getint('seed', 43)
    # seed for randomly choosing one dependency result from results with maximum scores
    scores_seed = options.getint('scores_seed', 3)
    output_prefix = options.get('output_file_prefix', 'predicted_')
    lang = options.get('lang', 'et')
    # Get model file or files
    model_file = None
    model_files = []
    if use_ensemble:
        # predict with ensemble: collect all model files from models_dir
        models_dir = _get_required_path(config, conf_file, section, 'models_dir', os.path.isdir)
        model_file_name_pattern = re.compile(r"^model_(.+)\.pt$")
        # sorted: os.listdir order is arbitrary, keep the ensemble order reproducible
        model_files = [ os.path.join(models_dir, fname) \
                        for fname in sorted(os.listdir(models_dir)) \
                        if model_file_name_pattern.match(fname) ]
        if not model_files:
            raise Exception( f'Error in {conf_file}: section {section!r}: Did not find any model files for '+\
                             f'the ensemble tagger from models_dir={models_dir!r}.' )
    else:
        # predict with a single model: get model file with path
        model_file = _get_required_path(config, conf_file, section, 'model_file', os.path.isfile)
    if dry_run:
        return
    # Run predictions
    start_time = datetime.now()
    morph_layer = None
    if use_estnltk:
        morph_layer = _get_required_option(config, conf_file, section, 'morph_layer')
    for input_file, subset in ((train_file, 'train'), (test_file, 'test')):
        output_file = os.path.join(output_dir, f'{output_prefix}{subset}.conllu')
        if not use_estnltk:
            # run vanilla stanza
            predict_with_stanza(input_file, model_file, output_file, lang=lang, use_gpu=use_gpu)
        elif not use_ensemble:
            # run StanzaSyntaxTagger
            predict_with_stanza_tagger(input_file, morph_layer, model_file, output_file,
                                       tagger_path=tagger_path, seed=seed, lang=lang,
                                       use_gpu=use_gpu)
        else:
            # run StanzaSyntaxEnsembleTagger
            predict_with_stanza_ensemble_tagger(input_file, morph_layer, model_files, output_file,
                                                tagger_path=tagger_path, seed=seed, lang=lang,
                                                use_majority_voting=use_majority_voting,
                                                use_gpu=use_gpu, scores_seed=scores_seed)
    print(f'Total time elapsed: {datetime.now()-start_time}')


def _run_multi_experiment_predictions(config, conf_file, section, experiment_type, subexp, dry_run):
    '''Validates a multi-experiment section and passes its settings to bulk_predict(...).'''
    options = config[section]
    # input_dir (training conllu files) and models_dir
    input_dir = _get_required_path(config, conf_file, section, 'input_dir', os.path.isdir)
    models_dir = _get_required_path(config, conf_file, section, 'models_dir', os.path.isdir)
    # test_file with full path, or a pattern for finding test file from input_dir
    test_file = _get_required_option(config, conf_file, section, 'test_file')
    test_file_is_pattern = options.getboolean('test_file_is_pattern', False)
    if not test_file_is_pattern and not os.path.isfile(test_file):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "test_file" value {test_file!r} in {section!r}.')
    output_dir = _get_required_option(config, conf_file, section, 'output_dir')
    # use_estnltk=True -- run model with estnltk's preprocessing and StanzaSyntaxTagger;
    # use_estnltk=False -- run model on input feats loaded from conllu file;
    use_estnltk = options.getboolean('use_estnltk', False)
    if options.getboolean('use_ensemble', False):
        # Ensemble models are not supported here
        raise NotImplementedError(f'Error in {conf_file}: section {section!r}: experiment_type='+\
                                  f'{experiment_type!r} does not support ensemble models.')
    tagger_path = options.get('tagger_path', 'estnltk_neural.taggers.StanzaSyntaxTagger')
    # Section's own dry_run setting overrides the global one, but only for this section
    dry_run = options.getboolean('dry_run', dry_run)
    use_gpu = options.getboolean('use_gpu', False)
    # skip_train: do not predict on train files
    skip_train = options.getboolean('skip_train', False)
    # test_matrix prediction mode: run all models on all test files
    test_matrix = options.getboolean('test_matrix', False)
    if test_matrix and not test_file_is_pattern:
        raise ValueError('(!) test_matrix can only be used if test file name is a regular expression')
    output_prefix = options.get('output_file_prefix', 'predicted_')
    seed = options.getint('seed', 43)
    lang = options.get('lang', 'et')
    morph_layer = None
    if use_estnltk:
        morph_layer = _get_required_option(config, conf_file, section, 'morph_layer')
    # Pattern for capturing names of training sub-experiment files. It must
    # contain the named group 'exp' (required by bulk_predict). Can be
    # overridden in the configuration.
    train_file_pat = options.get('train_file_pat', r'(?P<exp>\d+)_train_all.conllu')
    # Run models
    bulk_predict( input_dir, models_dir, train_file_pat, test_file,
                  output_dir, output_file_prefix=output_prefix, subexp=subexp,
                  test_file_is_pattern=test_file_is_pattern, parser='stanza',
                  use_estnltk=use_estnltk, morph_layer=morph_layer, seed=seed,
                  tagger_path=tagger_path, lang=lang, skip_train=skip_train,
                  test_matrix=test_matrix, use_gpu=use_gpu, dry_run=dry_run )


def run_models_main( conf_file, subexp=None, dry_run=False ):
    '''
    Runs model(s) based on the configuration.
    Settings/parameters of running model(s) will be read from the
    given `conf_file`.
    Executes sections in the configuration starting with prefix
    'predict_stanza_'. The section's parameter 'experiment_type'
    (default: 'full_data') must be one of: 'full_data',
    'crossvalidation', 'half_data', 'smaller_data', 'multi_experiment'.

    Optionally, if `subexp` is defined, then runs only that
    sub-experiment and skips all other sub-experiments (in
    crossvalidation, smaller_data and half_data experiments).

    If `dry_run` is set, then the configuration is validated, but no
    predictions are made. A section can override the setting with its
    own 'dry_run' parameter; the override applies to that section only.

    Raises FileNotFoundError if `conf_file` or a file/directory named in
    the configuration does not exist, and ValueError if the configuration
    cannot be parsed, a required parameter is missing, or parameters
    conflict with each other.
    '''
    # Parse configuration file
    config = configparser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise FileNotFoundError("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))
    multi_experiment_types = ['crossvalidation', 'half_data', 'smaller_data', 'multi_experiment']
    section_found = False
    for section in config.sections():
        if not section.startswith('predict_stanza_'):
            continue
        section_found = True
        subexp_str = '' if subexp is None else f' ({subexp})'
        print(f'Running {section}{subexp_str} ...')
        experiment_type = config[section].get('experiment_type', 'full_data')
        experiment_type_clean = (experiment_type.strip()).lower()
        if experiment_type_clean == 'full_data':
            _run_full_data_predictions(config, conf_file, section, dry_run)
        elif experiment_type_clean in multi_experiment_types:
            _run_multi_experiment_predictions(config, conf_file, section,
                                              experiment_type_clean, subexp, dry_run)
        else:
            raise ValueError('(!) Unexpected experiment_type value: {!r}'.format(experiment_type))
    if not section_found:
        print(f'No section starting with "predict_stanza_" in {conf_file}.')
+ Optinally, if test_file_is_pattern=True, then `test_file_path` is compiled to + a regular expression (must have named group 'exp') and used to find a test file + corresponding to train file from `data_folder`. + + By default, each model is evaluated on a train file, and on either a single + test file or a test file corresponding to the training file (of the sub- + experiment). + If skip_train==True, then no evaluation is done on train files. Note, however, + that even with skip_train==True, `train_file_pattern` must be provided, as it + is required for determining sub-experiment names and finding corresponding + models. + If test_matrix==True, then each model is evaluated on all test files. + The test_matrix mode only works with test_file_is_pattern=True option. + + Use parameter `subexp` to restrict predictions only to a single sub-experiment + instead of performing all sub-experiments. + This is useful when multiple instances of the Python are launched for + parallelization. + ''' + # Validate input arguments + supported_parsers = ['stanza'] + if not isinstance(parser, str) or parser.lower() not in supported_parsers: + raise ValueError( f'(!) Unexpected parser: {parser!r}. '+\ + f'Supported parsers: {supported_parsers!r}' ) + parser = parser.lower() + if parser == 'stanza' and tagger_path is None: + tagger_path = 'estnltk_neural.taggers.StanzaSyntaxTagger' + if not os.path.exists(data_folder) or not os.path.isdir(data_folder): + raise Exception(f'(!) Missing or invalid data_folder {data_folder!r}') + if not os.path.exists(models_folder) or not os.path.isdir(models_folder): + raise Exception(f'(!) Missing or invalid models_folder {models_folder!r}') + if use_estnltk and morph_layer is None: + raise Exception(f'(!) 
Unexpected None value for morph_layer with use_estnltk') + test_file_regex = None + if not test_file_is_pattern: + # Test file is always the same: should be a full path + if not os.path.exists(test_file_path) or not os.path.isfile(test_file_path): + raise Exception(f'(!) Missing or invalid test_file_path {test_file_path!r}') + else: + # Test file should be found via regexp: + # Convert test_file to regular experssion + if not isinstance(test_file_path, str): + raise TypeError('test_file_path must be a string') + try: + test_file_regex = re.compile(test_file_path) + except Exception as err: + raise ValueError(f'Unable to convert {test_file_path!r} to regexp') from err + if 'exp' not in test_file_regex.groupindex: + raise ValueError(f'Regexp {test_file_path!r} is missing named group "exp"') + if test_matrix and test_file_regex is None: + raise Exception(f'(!) test_matrix can only be used if test_file_regex is provided') + # Convert train_file_pattern to regular experssion + train_file_regex = None + if not isinstance(train_file_pattern, str): + raise TypeError('train_file_pattern must be a string') + try: + train_file_regex = re.compile(train_file_pattern) + except Exception as err: + raise ValueError(f'Unable to convert {train_file_pattern!r} to regexp') from err + if 'exp' not in train_file_regex.groupindex: + raise ValueError(f'Regexp {train_file_pattern!r} is missing named group "exp"') + # + # Collect experiment input files + # + models_folder_files = [ fname for fname in os.listdir(models_folder) ] + experiment_data = { 'train':[], 'test':[], 'models': [], 'numbers':[] } + if not test_matrix: + # ============================================================== + # Default mode: + # * run each model on its train file (if not skip_train) + # * run each model on its test file or on the global test file + # ============================================================== + for fname in sorted( os.listdir(data_folder) ): + m = train_file_regex.match(fname) + if m: + if 
not (fname.lower()).endswith('.conllu'): + raise Exception( f'(!) invalid file {fname}: train file '+\ + 'must have extension .conllu' ) + fpath = os.path.join(data_folder, fname) + # Training file varies, depending on the sub set of data + experiment_data['train'].append( fpath ) + no = m.group('exp') + if no not in experiment_data['numbers']: + experiment_data['numbers'].append(no) + if test_file_regex is None: + # No regexp for test file: + # Test file is always the same (global test file) + experiment_data['test'].append( test_file_path ) + else: + # Test file regexp provided: + # Find test file corresponding to train file + found_test_file = None + for fname_2 in sorted( os.listdir(data_folder) ): + m2 = test_file_regex.match(fname_2) + if m2: + no2 = m2.group('exp') + if no2 == no: + found_test_file = \ + os.path.join(data_folder, fname_2) + break + if found_test_file is not None: + experiment_data['test'].append( found_test_file ) + else: + raise Exception(f'(!) Unable to find test file corresponding '+\ + f'to train file {fname!r} from {data_folder!r}.') + # Find corresponding model from the models folder + target_model_file = f"model_{no}.pt" + model_found = False + for model_fname in models_folder_files: + if model_fname == target_model_file: + mfpath = os.path.join(models_folder, model_fname) + experiment_data['models'].append(mfpath) + model_found = True + break + if not model_found: + if not dry_run: + raise Exception(f'(!) 
Unable to find model {target_model_file!r} from {models_folder!r}') + else: + # Try run, emulate only, don't chk for models + experiment_data['models'].append(target_model_file) + else: + # ============================================================== + # Test matrix mode: + # * run each model on its train file (if train file is available) + # * run each model on all test files + # ============================================================== + for fname in sorted( os.listdir(data_folder) ): + m = test_file_regex.match(fname) + if m: + if not (fname.lower()).endswith('.conllu'): + raise Exception( f'(!) invalid file {fname}: test file '+\ + 'must have extension .conllu' ) + no = m.group('exp') + if no not in experiment_data['numbers']: + experiment_data['numbers'].append(no) + # Placeholder for test file to pass checks below + found_test_file = os.path.join(data_folder, fname) + experiment_data['test'].append( found_test_file ) + # Find corresponding model from the models folder + target_model_file = f"model_{no}.pt" + model_found = False + for model_fname in models_folder_files: + if model_fname == target_model_file: + mfpath = os.path.join(models_folder, model_fname) + experiment_data['models'].append(mfpath) + model_found = True + break + if not model_found: + if not dry_run: + raise Exception(f'(!) 
Unable to find model {target_model_file!r} from {models_folder!r}') + else: + # Try run, emulate only, don't chk for models + experiment_data['models'].append(target_model_file) + # Try to find corresponding train file (optional) + found_train_file = None + for fname_2 in sorted( os.listdir(data_folder) ): + m2 = train_file_regex.match(fname_2) + if m2: + no2 = m2.group('exp') + if no2 == no: + found_train_file = \ + os.path.join(data_folder, fname_2) + break + experiment_data['train'].append(found_train_file) + # + # Validate that we have correct numbers of experiment files + # + for subset in ['train']: + if len(experiment_data[subset]) == 0: + raise Exception(f'Unable to find any {subset} files '+\ + f'matching {train_file_pattern!r} in dir {data_folder!r}.') + if len(experiment_data[subset]) != len(experiment_data['numbers']): + no1 = len(experiment_data[subset]) + no2 = len(experiment_data['numbers']) + raise Exception(f'Number of {subset} files ({no1}) does not match '+\ + f'the number of experiments ({no2}).') + if len(experiment_data[subset]) != len(experiment_data['models']): + no1 = len(experiment_data[subset]) + no2 = len(experiment_data['models']) + raise Exception(f'Number of {subset} files ({no1}) does not match '+\ + f'the number of models ({no2}).') + if subexp is not None: + if subexp not in experiment_data['numbers']: + raise ValueError( f'(!) sub-experiment {subexp!r} not in collected '+\ + f'experiment names: {experiment_data["numbers"]}.' 
) + # + # Launch experiments + # + if not dry_run: + start_time = datetime.now() + for i in range( len(experiment_data['numbers']) ): + exp_no = experiment_data['numbers'][i] + train_file = experiment_data['train'][i] + test_file = experiment_data['test'][i] + model_file = experiment_data['models'][i] + if subexp is not None and exp_no != subexp: + # Skip other experiments + continue + if parser == 'stanza': + # Predict on train data (optional, can be skipped) + if train_file is not None and not skip_train: + train_output = os.path.join(output_path, f'{output_file_prefix}train_{exp_no}.conllu') + if use_estnltk: + predict_with_stanza_tagger(train_file, morph_layer, model_file, train_output, + tagger_path=tagger_path, seed=seed, lang=lang, + use_gpu=use_gpu) + else: + predict_with_stanza(train_file, model_file, train_output, lang=lang, use_gpu=use_gpu) + # Predict on test data + if not test_matrix: + # Predict on single test file + test_output = os.path.join(output_path, f'{output_file_prefix}test_{exp_no}.conllu') + if use_estnltk: + predict_with_stanza_tagger(test_file, morph_layer, model_file, test_output, + tagger_path=tagger_path, seed=seed, lang=lang, + use_gpu=use_gpu) + else: + predict_with_stanza(test_file, model_file, test_output, lang=lang, use_gpu=use_gpu) + else: + # Predict for matrix: predict on all test files + for j in range( len(experiment_data['numbers']) ): + exp_no2 = experiment_data['numbers'][j] + test_file2 = experiment_data['test'][j] + test_output = os.path.join(output_path, \ + f'{output_file_prefix}model_{exp_no}_test_{exp_no2}.conllu') + if use_estnltk: + predict_with_stanza_tagger(test_file2, morph_layer, model_file, test_output, + tagger_path=tagger_path, seed=seed, lang=lang, + use_gpu=use_gpu) + else: + predict_with_stanza(test_file2, model_file, test_output, lang=lang, use_gpu=use_gpu) + print() + print() + print(f'Total time elapsed: {datetime.now()-start_time}') + +# 
def create_estnltk_document( input_path, morph_layer='morph_extended',
                                         syntax_layer='gold_syntax' ):
    """
    Imports the CONLLU file at input_path as an estnltk Text object and
    preannotates it. The syntactic annotations from the file are stored
    in the layer syntax_layer. If the imported text has no
    'compound_tokens' layer, WhiteSpaceTokensTagger and
    PretokenizedTextCompoundTokensTagger are applied to it. Finally, the
    layer morph_layer is added via Text.tag_layer(...).

    Requires: estnltk v1.7.2+

    :param input_path: path to conllu file to be loaded
    :param morph_layer: name of estnltk's morphological analysis layer
    :param syntax_layer: name of syntactic analysis layer loaded from file
    :return: estnltk Text object
    """
    from estnltk import Text
    from estnltk.taggers import WhiteSpaceTokensTagger
    from estnltk.taggers import PretokenizedTextCompoundTokensTagger
    from estnltk.converters.conll.conll_importer import conll_to_text
    document = conll_to_text(input_path, syntax_layer)
    # The importer is expected to provide the basic segmentation layers
    for required_layer in ('words', 'sentences'):
        assert required_layer in document.layers
    if 'compound_tokens' not in document.layers:
        # Taggers are created and applied one by one, in this order
        for tagger_class in (WhiteSpaceTokensTagger, PretokenizedTextCompoundTokensTagger):
            tagger_class().tag( document )
    document.tag_layer( morph_layer )
    return document
def predict_with_stanza(input_path, model_path, output_path, lang='et', use_gpu=False):
    '''
    Applies stanza's model on given input CONLLU file to get depparse predictions.
    Saves predictions to output CONLLU file. Creates the directory of the output
    file if it does not exist yet.

    :param input_path: path to conllu file to be annotated
    :param model_path: path to depparse model to be used for making predictions
    :param output_path: path to output conllu file
    :param lang: language code passed to stanza's Pipeline
    :param use_gpu: value of stanza Pipeline's use_gpu setting
    '''
    config = {
        'processors': 'depparse', # Comma-separated list of processors to use
        'lang': lang, # Language code for the language to build the Pipeline in
        'depparse_pretagged': True,
        'depparse_model_path': model_path,
        'download_method': 0, # NONE will not download anything
        'use_gpu': use_gpu
    }
    nlp = Pipeline(**config)
    doc = create_stanza_document(input_path)
    nlp(doc)
    output_dir = os.path.dirname(output_path)
    # output_dir is '' if output_path is a bare file name (output goes to the
    # current directory); os.makedirs('') would raise FileNotFoundError
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    write_stanza_doc_to_conll(doc, output_path)
def predict_with_stanza_ensemble_tagger(input_path, morph_layer, model_paths, output_path,
                                        tagger_path='estnltk_neural.taggers.StanzaSyntaxEnsembleTagger',
                                        seed=None, scores_seed=None, use_majority_voting=False,
                                        lang='et', use_gpu=False, verbose=True):
    '''
    Applies estnltk's StanzaSyntaxEnsembleTagger on given input CONLLU file to get depparse predictions.
    Uses estnltk's preprocessing to load and re-annotate document (adds morph_layer).
    Saves predictions to output CONLLU file. Creates the directory of the output
    file if it does not exist yet.

    By default, imports tagger from 'estnltk_neural.taggers.StanzaSyntaxEnsembleTagger', but you can
    use `tagger_path` to overwrite the importing path. Use this if you've customized the tagger
    (e.g. made fixes for it), and want to test it out (instead of the default version).

    Requires: estnltk v1.7.2+

    :param input_path: path to conllu file to be annotated
    :param morph_layer: name of estnltk's morphological analysis layer
    :param model_paths: list of paths to depparse models to be used for making predictions
    :param output_path: path to output conllu file
    :param tagger_path: full import path of StanzaSyntaxEnsembleTagger
    :param seed: seed of the random process creating unambiguous morph analysis layer
    :param scores_seed: seed of the random process picking one parse from multiple parses with max score
    :param use_majority_voting: whether StanzaSyntaxEnsembleTagger should use 'majority_voting' as the
           aggregation algorithm
    :param lang: not used by this function (accepted for a signature parallel to other predict functions)
    :param use_gpu: passed to the tagger as its use_gpu parameter
    :param verbose: whether progress messages should be printed
    '''
    tagger_loader = \
        create_stanza_ensemble_tagger_loader( tagger_path, model_paths, morph_layer,
                                              use_majority_voting=use_majority_voting,
                                              use_gpu=use_gpu, seed=seed, scores_seed=scores_seed )
    tagger = tagger_loader.tagger  # Load tagger
    if verbose:
        print(f'Loaded {tagger_path!r} with {len(model_paths)} models for prediction.')
    text_obj = create_estnltk_document(input_path, morph_layer=morph_layer)
    if verbose:
        print(f'Preprocessed {input_path!r}.')
    tagger.tag(text_obj)
    if verbose:
        print(f'Parsed {input_path!r} with the ensemble tagger.')
    output_dir = os.path.dirname(output_path)
    # output_dir is '' if output_path is a bare file name (output goes to the
    # current directory); os.makedirs('') would raise FileNotFoundError
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    write_estnltk_text_to_conll(text_obj, tagger.output_layer, output_path)
def write_estnltk_text_to_conll(text, syntax_layer, output_path):
    '''
    Writes given estnltk's Text with syntax_layer to CoNLLU format output file.

    If the syntax layer has an 'entropy' attribute and `output_path` ends with
    '.conllu', then an additional file with the same name but the extension
    '.entropy' is written: one line per word with tab-separated 'votes' and
    'entropy' values, sentences separated by an empty line. If `output_path`
    has a different ending, a warning is issued and no entropy file is written.

    :param text: estnltk's Text object containing the syntax layer
    :param syntax_layer: name of the syntax layer to be written out
    :param output_path: path to output conllu file
    '''
    # Local imports: neither name is guaranteed to be imported at module level
    import warnings
    from estnltk.converters.conll.conll_exporter import layer_to_conll
    text_conll_str = \
        layer_to_conll(text, syntax_layer, preserve_ambiguity=False)
    with open(output_path, 'w', encoding='utf-8') as fout:
        fout.write( text_conll_str )
        fout.write('\n')
    layer = text[syntax_layer]
    if 'entropy' not in layer.attributes:
        return
    conllu_ext = '.conllu'
    if not output_path.endswith(conllu_ext):
        warnings.warn( f'(!) Unexpected file ending in {output_path} (expected .conllu), '+\
                       'skipping entropy file creation.' )
        return
    # Write out prediction entropy results.
    # Swap only the trailing extension: str.replace() would also rewrite
    # '.conllu' occurring inside directory names of the path.
    output_path_entropy = output_path[:-len(conllu_ext)] + '.entropy'
    last_word_id = None
    with open(output_path_entropy, 'w', encoding='utf-8') as fout:
        for syntax_word in layer:
            ann = syntax_word.annotations[0]
            cur_word_id = int(ann['id'])
            if last_word_id is not None and cur_word_id == 1:
                # word id restarted from 1: add sentence break
                fout.write('\n')
            fout.write( str(ann['votes'])+'\t'+str(ann['entropy']) )
            fout.write('\n')
            last_word_id = cur_word_id
        fout.write('\n' * 2)
def run_models_main( conf_file, subexp=None, dry_run=False ):
    '''
    Runs MaltParser/UDPipe-1 models to get predictions based on
    the configuration.
    Settings/parameters of running models will be read from
    the given `conf_file`.
    Executes sections in the configuration starting with prefix
    'predict_malt_' and 'predict_udpipe1_'.

    Optionally, if `subexp` is defined, then predicts only
    that sub-experiment and skips all other sub-experiments (in
    crossvalidation, smaller_data and half_data experiments).

    :param conf_file: path to the configuration (INI) file
    :param subexp: name of the only sub-experiment to be run (or None to run all);
                   has no effect on 'full_data' experiments
    :param dry_run: default value of the 'dry_run' option for sections that do not
                    set it; in a dry run, the configuration and input files are
                    checked, but no predictions are made
    :raises FileNotFoundError: if conf_file or a file/directory named in it does not exist
    :raises ValueError: if the configuration is invalid or a mandatory option is missing
    '''
    # Parse configuration file
    config = configparser.ConfigParser()
    if conf_file is None or not os.path.exists(conf_file):
        raise FileNotFoundError("Config file {} does not exist".format(conf_file))
    if len(config.read(conf_file)) != 1:
        raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file))
    section_found = False
    for section in config.sections():
        if section.startswith('predict_malt_'):
            parser = 'maltparser'
        elif section.startswith('predict_udpipe1_'):
            parser = 'udpipe1'
        else:
            continue
        section_found = True
        subexp_str = '' if subexp is None else f' ({subexp})'
        print(f'Running {section}{subexp_str} ...')
        experiment_type = config[section].get('experiment_type', 'full_data')
        experiment_type_clean = (experiment_type.strip()).lower()
        if experiment_type_clean == 'full_data':
            _run_full_data_section(config, conf_file, section, parser, dry_run)
        elif experiment_type_clean in ('crossvalidation', 'half_data', 'smaller_data', 'multi_experiment'):
            _run_multi_experiment_section(config, conf_file, section, parser, subexp, dry_run)
        else:
            raise ValueError('(!) Unexpected experiment_type value: {!r}'.format(experiment_type))
    if not section_found:
        print(f'No section starting with "predict_malt_" or "predict_udpipe1_" in {conf_file}.')


def _get_required_option(config, conf_file, section, option, expect=None):
    '''Returns value of a mandatory option; if expect is 'file' or 'dir', also checks that the path exists.'''
    if not config.has_option(section, option):
        raise ValueError(f'Error in {conf_file}: section {section!r} is missing "{option}" parameter.')
    value = config[section][option]
    if (expect == 'file' and not os.path.isfile(value)) or \
       (expect == 'dir' and not os.path.isdir(value)):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "{option}" value {value!r} in {section!r}.')
    return value


def _predict(parser, model, corpus, output_file, output_dir, section_opts, ensemble_opts=None):
    '''Runs one prediction with a single model (ensemble_opts is None) or with an ensemble (model is a list).'''
    # Tool locations are resolved here, so that a dry run only validates inputs
    maltparser_dir = section_opts.get('maltparser_dir', DEFAULT_MALTPARSER_DIR)
    maltparser_jar = section_opts.get('maltparser_jar', DEFAULT_MALTPARSER_JAR)
    udpipe_dir = section_opts.get('udpipe_dir', DEFAULT_UDPIPE_DIR)
    if parser not in ('maltparser', 'udpipe1'):
        raise Exception(f'Unexpected parser name: {parser!r}')
    if ensemble_opts is not None:
        # predict_ensemble only uses the options of the given parser
        predict_ensemble(parser, model, corpus, output_file, output_dir,
                         udpipe_dir=udpipe_dir,
                         maltparser_dir=maltparser_dir,
                         maltparser_jar=maltparser_jar,
                         **ensemble_opts)
    elif parser == 'maltparser':
        predict_maltparser(model, corpus, output_file, output_dir,
                           maltparser_dir=maltparser_dir,
                           maltparser_jar=maltparser_jar)
    else:
        predict_udpipe1(model, corpus, output_file, output_dir,
                        udpipe_dir=udpipe_dir)


def _run_full_data_section(config, conf_file, section, parser, dry_run):
    '''Validates a 'full_data' section and predicts on its train and test file.'''
    section_opts = config[section]
    train_file = _get_required_option(config, conf_file, section, 'train_file', expect='file')
    test_file = _get_required_option(config, conf_file, section, 'test_file', expect='file')
    output_dir = _get_required_option(config, conf_file, section, 'output_dir')
    output_prefix = section_opts.get('output_file_prefix', 'predicted_')
    if not output_prefix.endswith('_'):
        output_prefix += '_'
    # use multiple models as an ensemble
    use_ensemble = section_opts.getboolean('use_ensemble', False)
    use_majority_voting = section_opts.getboolean('use_majority_voting', False)
    scores_seed = section_opts.getint('scores_seed', 3)
    ensemble_opts = None
    if use_ensemble:
        # predict with ensemble: collect all model files from models_dir
        models_dir = _get_required_option(config, conf_file, section, 'models_dir', expect='dir')
        model_ext = 'mco' if parser == 'maltparser' else 'udpipe'
        # Dot is escaped and the whole name must match, so that e.g. backup
        # copies such as 'model_1.mco.bak' are not picked up as models
        model_name_re = re.compile(r'model_(.+)\.' + model_ext)
        # Sorted: os.listdir() order is arbitrary, but model order affects
        # seeded tie-breaking inside the ensemble
        model = [ os.path.join(models_dir, fname) \
                  for fname in sorted(os.listdir(models_dir)) \
                  if model_name_re.fullmatch(fname) ]
        if len(model) == 0:
            raise Exception( f'Error in {conf_file}: section {section!r}: Did not find any model files for '+\
                             f'the ensemble tagger from models_dir={models_dir!r}.' )
        ensemble_opts = {
            'aggregation_algorithm': 'majority_voting' if use_majority_voting else 'las_coherence',
            'random_pick_max_score_seed': scores_seed
        }
    else:
        # predict with a single model: get model file with path
        default_model = 'model.mco' if parser == 'maltparser' else 'model.udpipe'
        model = section_opts.get('model_file', default_model)
        if not os.path.isfile(model):
            raise FileNotFoundError(f'Error in {conf_file}: invalid "model_file" value {model!r} in {section!r}.')
    # Section-local value: dry_run of one section must not leak into the next one
    section_dry_run = section_opts.getboolean('dry_run', dry_run)
    if section_dry_run:
        return
    # Predict on train, then on test
    for corpus, output_file in ( (train_file, f'{output_prefix}train.conllu'),
                                 (test_file,  f'{output_prefix}test.conllu') ):
        _predict(parser, model, corpus, output_file, output_dir, section_opts,
                 ensemble_opts=ensemble_opts)


def _run_multi_experiment_section(config, conf_file, section, parser, subexp, dry_run):
    '''Validates a multi-experiment section (crossvalidation etc.) and predicts for each sub-experiment.'''
    section_opts = config[section]
    input_dir = _get_required_option(config, conf_file, section, 'input_dir', expect='dir')
    output_dir = _get_required_option(config, conf_file, section, 'output_dir')
    models_dir = _get_required_option(config, conf_file, section, 'models_dir', expect='dir')
    models_dir_files = os.listdir(models_dir)
    if len(models_dir_files) == 0:
        raise Exception(f'(!) No files found from models_dir {models_dir!r}')
    # test_file with full path, or a pattern for finding test file from input_dir
    test_file = _get_required_option(config, conf_file, section, 'test_file')
    test_file_is_pattern = section_opts.getboolean('test_file_is_pattern', False)
    if not test_file_is_pattern and not os.path.isfile(test_file):
        raise FileNotFoundError(f'Error in {conf_file}: invalid "test_file" value {test_file!r} in {section!r}.')
    # Common options
    # Section-local value: dry_run of one section must not leak into the next one
    section_dry_run = section_opts.getboolean('dry_run', dry_run)
    output_file_prefix = section_opts.get('output_file_prefix', 'predicted_')
    if not output_file_prefix.endswith('_'):
        output_file_prefix += '_'
    # Train file pattern (must contain named group "exp": name of the sub-experiment)
    train_file_pat = r'(?P<exp>\d+)_train.conllu'
    if config.has_option(section, 'train_file_pat'):
        train_file_pat = section_opts['train_file_pat']
    train_file_re = _create_regexp_pattern(train_file_pat, 'train_file_pat')
    # Test file pattern (if provided)
    test_file_re = None
    if test_file_is_pattern:
        test_file_re = _create_regexp_pattern(test_file, 'test_file')
    # skip_train: do not predict on train files
    skip_train = section_opts.getboolean('skip_train', False)
    # test_matrix prediction mode: run all models on all test files
    test_matrix = section_opts.getboolean('test_matrix', False)
    if test_matrix and test_file_re is None:
        raise ValueError(f'(!) test_matrix can only be used if test file name is a regular expression')
    # Collect all train files
    conllu_fnames = [ fname for fname in sorted(os.listdir(input_dir)) if fname.endswith('.conllu') ]
    all_train_files = {}
    for in_fname in conllu_fnames:
        train_match = train_file_re.match(in_fname)
        if train_match:
            # NB: uses a local name, so that the requested `subexp` is not overwritten
            all_train_files[train_match.group('exp')] = os.path.join(input_dir, in_fname)
    # Collect all test files
    all_test_files = {}
    if test_file_re is not None:
        # If test_file regex is provided, then collect test files via regexp
        for in_fname in conllu_fnames:
            test_match = test_file_re.match(in_fname)
            if test_match:
                test_file_subexp = test_match.group('exp')
                assert test_file_subexp not in all_test_files, \
                    f'Duplicate test files for experiment {test_file_subexp}'
                all_test_files[test_file_subexp] = os.path.join(input_dir, in_fname)
    else:
        # If test_file regex is missing, assign a single test file for
        # all experiments (global testing)
        for cur_subexp in sorted(all_train_files):
            all_test_files[cur_subexp] = test_file
    # Sanity checks
    if len(all_test_files) > 0 and len(all_train_files) > 0:
        if not (all_train_files.keys() == all_test_files.keys()):
            raise ValueError('(!) Mismatching train and test sub-experiment '+\
                             f'names. Train experiments: {all_train_files.keys()!r}; '+\
                             f'Test experiments: {all_test_files.keys()!r} ')
    elif len(all_test_files) == 0 and len(all_train_files) == 0:
        raise ValueError(f'(!) No train or test files found from {input_dir}')
    if subexp is not None and subexp not in all_test_files:
        print(f'(!) Sub-experiment {subexp!r} not found in section {section!r}: nothing to predict.')
    #
    # Iterate over input files and predict
    #
    all_test_subexps = sorted(all_test_files)
    for cur_subexp in all_test_subexps:
        if subexp is not None and cur_subexp != subexp:
            # Only the requested sub-experiment is run
            continue
        cur_test_file = all_test_files[cur_subexp]
        cur_train_file = all_train_files.get(cur_subexp, None)
        assert cur_train_file is not None or test_matrix, \
            f'Missing train file for experiment {cur_subexp}'
        if parser == 'maltparser':
            model_file = f'model_{cur_subexp}.mco'
        else:
            model_file = f'model_{cur_subexp}.udpipe'
        # Try to find corresponding model from the models subdirectory
        if model_file not in models_dir_files:
            raise Exception(f'(!) Could not find model file {model_file!r} for experiment '+\
                            f'{cur_subexp!r} from {models_dir!r}.')
        if section_dry_run:
            continue
        model_path = os.path.join(models_dir, model_file)
        # Predict on train data (optional, can be skipped)
        if cur_train_file is not None and not skip_train:
            _predict(parser, model_path, cur_train_file,
                     f'{output_file_prefix}train_{cur_subexp}.conllu', output_dir, section_opts)
        # Predict on test
        if not test_matrix:
            _predict(parser, model_path, cur_test_file,
                     f'{output_file_prefix}test_{cur_subexp}.conllu', output_dir, section_opts)
        else:
            # predict on all test files
            for test_subexp in all_test_subexps:
                # File name only: the predict functions place it into output_dir
                matrix_output_file = \
                    f'{output_file_prefix}model_{cur_subexp}_test_{test_subexp}.conllu'
                _predict(parser, model_path, all_test_files[test_subexp],
                         matrix_output_file, output_dir, section_opts)


def _create_regexp_pattern(fpattern, pattern_var_name):
    '''Compiles file name pattern into a regular expression that must contain named group "exp".'''
    if not isinstance(fpattern, str):
        raise TypeError(f'{pattern_var_name} must be a string')
    try:
        regexp = re.compile(fpattern)
    except re.error as err:
        raise ValueError(f'Unable to convert {fpattern!r} to regexp') from err
    if 'exp' not in regexp.groupindex:
        raise ValueError(f'Regexp {fpattern!r} is missing named group "exp"')
    return regexp
def predict_maltparser(model_path, test_corpus, output_file, output_dir, maltparser_dir=DEFAULT_MALTPARSER_DIR,
                                                                          maltparser_jar=DEFAULT_MALTPARSER_JAR):
    '''
    Runs MaltParser on given test_corpus to get predictions and saves results as conllu into output_file.
    output_file should be a file name, use output_dir to specify its location.

    :param model_path: path to MaltParser's model file
    :param test_corpus: path to the conllu file to be parsed
    :param output_file: name of the output file (without directory)
    :param output_dir: directory where the output file is moved after parsing;
                       if None, the output is left into maltparser_dir
    :param maltparser_dir: directory containing MaltParser's jar file and libraries
    :param maltparser_jar: name of MaltParser's jar file inside maltparser_dir
    :raises subprocess.CalledProcessError: if the java process exits with a non-zero status
    '''
    check_maltparser_requirements(maltparser_dir=maltparser_dir, maltparser_jar=maltparser_jar)
    # Make input file paths absolute (the parser is run with cwd=maltparser_dir)
    test_corpus = os.path.abspath(test_corpus)
    model_path = os.path.abspath(model_path)
    # Note: Maltparser's model must be at the same directory as maltparser jar,
    # otherwise we'll run into error "Couldn't find the MaltParser configuration
    # file". Copy the model.
    model_dir, model_name = os.path.split(model_path)
    copied_model_path = None
    # Compare normalized absolute paths: a substring test between an absolute
    # model_dir and a (possibly relative) maltparser_dir gives wrong answers
    if os.path.normcase(model_dir) != os.path.normcase(os.path.abspath(maltparser_dir)):
        copied_model_path = os.path.join(maltparser_dir, model_name)
        shutil.copyfile(model_path, copied_model_path)
    # Construct command as an argument list (no shell), so that paths
    # containing spaces or shell metacharacters are passed on intact
    predict_command = [ 'java', '-jar', maltparser_jar,
                        '-c', model_name,
                        '-i', test_corpus,
                        '-o', output_file,
                        '-m', 'parse' ]
    try:
        # Execute; check=True turns a failed parser run into an exception
        subprocess.run(predict_command, cwd=maltparser_dir, check=True)
    finally:
        # Remove copied model (also if parsing failed)
        if copied_model_path is not None:
            os.remove(copied_model_path)
    # Relocate output
    if output_dir is not None:
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        target_path = os.path.join(output_dir, output_file)
        # Remove old file
        if os.path.exists(target_path):
            os.remove(target_path)
        # Move predicted file to output dir (shutil.move also works across filesystems)
        shutil.move(os.path.join(maltparser_dir, output_file), target_path)
def predict_ensemble(parser, model_files, test_corpus, output_file, output_dir, aggregation_algorithm = 'las_coherence',
                                                                               random_pick_max_score_seed = 3,
                                                                               udpipe_dir=DEFAULT_UDPIPE_DIR,
                                                                               maltparser_dir=DEFAULT_MALTPARSER_DIR,
                                                                               maltparser_jar=DEFAULT_MALTPARSER_JAR):
    '''
    Runs an ensemble of MaltParser or UDPipe1 models on given test_corpus to get predictions and saves
    results as conllu into output_file.
    output_file should be a file name, use output_dir to specify its location.

    Each model's predictions are first written into a temporary conllu file in output_dir;
    temporary files are removed at the end (also if an error occurs).

    :param parser: 'maltparser' or 'udpipe1'
    :param model_files: list of model file paths (at least one)
    :param test_corpus: path to the conllu file to be parsed
    :param output_file: name of the output file (without directory)
    :param output_dir: directory of the output file
    :param aggregation_algorithm: 'las_coherence' (pick the sentence parse with the highest average
           LAS with respect to all parses) or 'majority_voting' (pick head & deprel with most votes
           for each token)
    :param random_pick_max_score_seed: seed of the random generator used for breaking ties
    :param udpipe_dir: location of UDPipe (used if parser == 'udpipe1')
    :param maltparser_dir: location of MaltParser (used if parser == 'maltparser')
    :param maltparser_jar: name of MaltParser's jar file (used if parser == 'maltparser')
    :raises ValueError: if model_files is empty or the models' outputs differ in the number of sentences
    '''
    assert parser in ['maltparser', 'udpipe1']
    assert aggregation_algorithm in ['las_coherence', 'majority_voting']
    if len(model_files) == 0:
        raise ValueError('(!) At least one model file is required for ensemble prediction.')
    temp_prediction_files = []
    try:
        # 1) Collect predictions from all of the models
        for model_id, model_file in enumerate(model_files):
            if parser == 'maltparser':
                temp_output_file = f'temp_malt_predict_{model_id}.conllu'
            else:
                temp_output_file = f'temp_udpipe_predict_{model_id}.conllu'
            # Registered before predicting, so that a partial output gets cleaned up, too
            temp_prediction_files.append( os.path.join(output_dir, temp_output_file) )
            if parser == 'maltparser':
                predict_maltparser(model_file, test_corpus, temp_output_file, output_dir,
                                   maltparser_dir=maltparser_dir,
                                   maltparser_jar=maltparser_jar)
            else:
                predict_udpipe1(model_file, test_corpus, temp_output_file, output_dir,
                                udpipe_dir=udpipe_dir)
        # 2) Load corresponding predicted conllu contents
        predicted_docs = []
        for temp_output_file in temp_prediction_files:
            assert os.path.exists(temp_output_file), \
                f'(!) Missing {parser} output file {temp_output_file!r}.'
            with open(temp_output_file, 'r', encoding='utf-8') as conllu_file:
                predicted_docs.append( conllu.parse(conllu_file.read()) )
        # 2.x) Validate that all docs have equal number of sentences.
        #      Every doc is compared against the first one, which also gives the
        #      right sentence count for an "ensemble" of a single model.
        first_file = temp_prediction_files[0]
        number_of_sentences = len(predicted_docs[0])
        for doc_file, doc in zip(temp_prediction_files[1:], predicted_docs[1:]):
            if len(doc) != number_of_sentences:
                raise ValueError( f'(!) Number of sentences differ in predicted output files: '+\
                                  f' {first_file}: {number_of_sentences} vs {doc_file}: {len(doc)}.' )
        # Random generator for choosing one parse/label if there are multiple candidates with maximum scores
        random_shuffler = Random()
        random_shuffler.seed(random_pick_max_score_seed)
        # 3) Iterate over all sentences and get aggregate predictions of each sentence
        output_doc = []
        for sent_id in range(number_of_sentences):
            all_sent_predictions = [doc[sent_id] for doc in predicted_docs]
            if aggregation_algorithm == 'las_coherence':
                picked = _pick_most_coherent_sentence(all_sent_predictions, random_shuffler)
            else:
                picked = _merge_sentences_by_majority_voting(all_sent_predictions, random_shuffler)
            output_doc.append( picked )
        assert len(output_doc) == number_of_sentences
        # 4) Output picked sentences
        final_output_fpath = os.path.join(output_dir, output_file)
        with open(final_output_fpath, 'w', encoding='utf-8') as out_f:
            for sentence in output_doc:
                out_f.write( sentence.serialize() )
    finally:
        # 5) Finally, remove conllu files with temporary predictions
        for temp_file in temp_prediction_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)


def _pick_most_coherent_sentence(sent_predictions, random_shuffler):
    '''Returns the parse with the highest average LAS w.r.t. all parses of the sentence (ties are broken randomly).'''
    from decimal import localcontext
    sent_scores = dict()
    # Precision 4 is applied only locally: changing getcontext().prec would
    # alter decimal arithmetic of the whole process
    with localcontext() as ctx:
        ctx.prec = 4
        for model_a, sent_a in enumerate( sent_predictions ):
            las_values = [ Decimal(sentence_LAS(sent_a, sent_b)) for sent_b in sent_predictions ]
            sent_scores[model_a] = sum(las_values) / Decimal(len(sent_predictions))
    max_score = max(sent_scores.values())
    max_score_models = [model for model, score in sent_scores.items() if score == max_score]
    random_shuffler.shuffle( max_score_models )
    return sent_predictions[ max_score_models[0] ]


def _merge_sentences_by_majority_voting(sent_predictions, random_shuffler):
    '''Returns a copy of the first parse in which each token is replaced by a token with the most voted deprel & head.'''
    sentence_length = len(sent_predictions[0])
    for sentence in sent_predictions[1:]:
        if len(sentence) != sentence_length:
            raise ValueError( f'(!) Number of tokens differ in predicted sentences: '+\
                              f'{sentence_length} vs {len(sentence)}.' )
    new_sentence = sent_predictions[0].copy()
    for token_id in range(sentence_length):
        # Group tokens by their (deprel, head) label; number of tokens == number of votes
        label_token_map = defaultdict(list)
        for sentence in sent_predictions:
            token = sentence[token_id]
            label = '{}__{}'.format(token['deprel'], token['head'])
            label_token_map[label].append(token)
        # Find maximum voting score and corresponding tokens
        max_votes = max( len(tokens) for tokens in label_token_map.values() )
        max_votes_tokens = []
        for tokens in label_token_map.values():
            if len(tokens) == max_votes:
                max_votes_tokens.extend(tokens)
        # In case of a tie, pick a token randomly
        random_shuffler.shuffle(max_votes_tokens)
        new_sentence[token_id] = max_votes_tokens[0].copy()
    return new_sentence


def sentence_LAS(sent1, sent2):
    '''
    Calculates LAS between two conllu sentences.

    Tokens are compared pairwise; tokens of sent1 with xpos 'Z' are left out
    of the scoring. Returns 1 if there are no tokens to be scored.
    '''
    wrong = 0
    correct = 0
    for tok1, tok2 in zip(sent1, sent2):
        if tok1['xpos'] != 'Z':
            if tok1['head'] == tok2['head'] and tok1['deprel'] == tok2['deprel']:
                correct += 1
            else:
                wrong += 1
    if wrong == 0 and correct == 0:
        return 1
    else:
        return correct / (correct + wrong)
Missing input argument: name of the configuration INI file.') + conf_file = sys.argv[1] + subexp = None + if len(sys.argv) > 2: + subexp = sys.argv[2] + run_models_main( conf_file, subexp=subexp ) diff --git a/05_evaluate.py b/05_evaluate.py new file mode 100644 index 00000000..97d793d0 --- /dev/null +++ b/05_evaluate.py @@ -0,0 +1,1359 @@ +# +# Looks through all experiment configurations, and +# performs evaluations described in configuration +# files: compares predicted files to gold standard +# files and finds LAS/UAS scores or different types +# of parsing errors (E1, E2, E3). +# +# Supported settings: +# * full_data +# * multi_experiment (general) +# * crossvalidation +# * half_data +# * smaller_data +# +# Requires EstNLTK version 1.7.2+. +# + +import sys +import csv, re +import os, os.path +from statistics import mean +import random +import configparser +import warnings + +import conllu + +import scipy.stats as scipy_stats +from numpy import array as n_array + +from estnltk.converters.conll.conll_importer import conll_to_text +from estnltk.converters.conll.conll_importer import add_layer_from_conll + + +def eval_main(conf_file, collected_results=None, ignore_missing=True, verbose=True, round=True, count_words=False): + ''' + Performs evaluations described in given conf_file. Executes sections starting with prefix + 'eval_'. If `ignore_missing` is set, then skips evaluation sections where input files (gold + standard or prediction files) are missing. + + Evaluation results (train and test LAS/UAS scores, and gaps between train and test LAS) + will be saved into dictionary collected_results. + Use parameter collected_results to overwrite the dictionary with your own (in order to + collect evaluation results over multiple configuration files). If parameter is None, a + new dictionary will be created. 
+ + A special case: if the evaluation counts different types of parsing errors (the + configuration setting count_error_types=True), then the configuration must provide + output_csv_file and results will be saved right away into the given csv file; + in this case, nothing will be added to collected_results. + + Returns collected_results. + + :param conf_file: configuration (INI) file defining evaluations + :param collected_results: dictionary where to save evaluation results + :param ignore_missing: if True, then missing evaluation files will be ignored. + Otherwise, an exception will be raised in case of a missing file. + :param verbose: if True, then scores will be output to screen immediately after calculation. + :param round: if True, then rounds scores to 4 decimals; otherwise collects unrounded scores. + :param count_words: if True, then reports evaluation word counts (under keys 'train_words' + and 'test_words'). Note that evaluation excludes null nodes from scoring (optionally, + punctuation can also be excluded by setting exclude_punct=True in conf_file). + ''' + if collected_results is None: + collected_results = dict() + config = configparser.ConfigParser() + if len(config.read(conf_file)) != 1: + raise ValueError("File {} is not accessible or is not in valid INI format".format(conf_file)) + for section in config.sections(): + # look for 'eval_' sections + if section.startswith('eval_'): + conf_path, conf_fname = os.path.split(conf_file) + print(f'{conf_fname}: Checking {section} ...') + experiment_type = config[section].get('experiment_type', 'full_data') + experiment_type_clean = (experiment_type.strip()).lower() + if experiment_type_clean not in ['full_data', 'crossvalidation', 'half_data', 'smaller_data', 'multi_experiment']: + raise ValueError('(!) 
Unexpected experiment_type value: {!r}'.format(experiment_type)) + intermediate_results = [] + output_csv_file = None + test_matrix = False + if experiment_type_clean == 'full_data': + # -------------------------------------------------------------- + # 'full_data' + # -------------------------------------------------------------- + # ============================================================== + # Full-data experiment evaluation modes: + # + # Default mode: + # * eval on train file, compute LAS and UAS (if not skip_train) + # * eval on test file, compute LAS and UAS + # + # Error types mode: + # * eval on train file, count different types of errors (if not + # skip_train) + # * eval on test file, count different types of errors + # ============================================================== + # gold_test and predicted_test with paths + if not config.has_option(section, 'gold_test'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "gold_test" parameter.') + gold_test = config[section]['gold_test'] + gold_test_exists = os.path.exists(gold_test) + if not config.has_option(section, 'predicted_test'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "predicted_test" parameter.') + predicted_test = config[section]['predicted_test'] + predicted_test_exists = os.path.exists(predicted_test) + # skip_train: do not evaluate on train files + skip_train = config[section].getboolean('skip_train', False) + # gold_train and predicted_train with paths + if skip_train: + # ignore train files. 
empty values as placeholders + gold_train = ''; gold_train_exists=True + predicted_train = ''; predicted_train_exists=True + else: + # validate that train files have been provided + if not config.has_option(section, 'gold_train'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "gold_train" parameter.') + gold_train = config[section]['gold_train'] + gold_train_exists = os.path.exists(gold_train) + if not config.has_option(section, 'predicted_train'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "predicted_train" parameter.') + predicted_train = config[section]['predicted_train'] + predicted_train_exists = os.path.exists(predicted_train) + # other parameters + error_types_mode = config[section].getboolean('count_error_types', False) + param_count_words = config[section].getboolean('count_words', count_words) + experiment_name = config[section].get('name', section) + exclude_punct = config[section].getboolean('exclude_punct', False) + punct_tokens_file = config[section].get('punct_tokens_file', None) + punct_tokens_set = load_punct_tokens( punct_tokens_file ) # Attempt to load from file. 
If None, return empty set + output_csv_file = config[section].get('output_csv_file', None) # Output results to csv file right after evaluation + error_sample_size = config[section].getint('error_sample_size', 100) + add_conf_intervals = config[section].getboolean('add_conf_intervals', False) + calc_scores_with_entropy = config[section].getboolean('calc_scores_with_entropy', False) + output_train_error_sample_file = config[section].get('output_train_error_sample_file', None) + output_test_error_sample_file = config[section].get('output_test_error_sample_file', None) + if not error_types_mode: + if output_train_error_sample_file is not None: + raise ValueError(f'Error in {conf_file}, section {section!r}: error sampling (output_train_error_sample_file) '+\ + 'only works with option count_error_types=True.') + if output_test_error_sample_file is not None: + raise ValueError(f'Error in {conf_file}, section {section!r}: error sampling (output_test_error_sample_file) '+\ + 'only works with option count_error_types=True.') + if error_types_mode: + if add_conf_intervals: + raise ValueError(f'Error in {conf_file}, section {section!r}: add_conf_intervals only works with the option '+\ + 'count_error_types=False.') + if calc_scores_with_entropy: + raise ValueError(f'Error in {conf_file}, section {section!r}: calc_scores_with_entropy only works with '+\ + 'the option count_error_types=False.') + if calc_scores_with_entropy and add_conf_intervals: + raise ValueError(f'Error in {conf_file}, section {section!r}: calc_scores_with_entropy and add_conf_intervals '+\ + 'cannot be used simultaneously. 
Set only one of the two parameters.') + if output_csv_file is None and error_types_mode: + raise ValueError(f'Error in {conf_file} section {section!r}: '+\ + f'count_error_types requires setting "output_csv_file" parameter.') + # Sanity check to avoid accidental confusion with 'multi_experiment' + if config.has_option(section, 'test_matrix') and config[section].getboolean('test_matrix', False): + raise ValueError(f'Error in {conf_file}, section {section!r}: '+\ + f'test_matrix option works only with experiment_type = multi_experiment.') + all_files_exist = gold_train_exists and gold_test_exists and \ + predicted_train_exists and predicted_test_exists + if all_files_exist: + format_string = ':.4f' if round else None + if not error_types_mode: + # + # Default eval mode (calculate LAS and UAS) + # + results = score_experiment( predicted_test, gold_test, predicted_train, gold_train, + gold_path=None, predicted_path=None, format_string=format_string, + add_conf_intervals=add_conf_intervals, + calc_scores_with_entropy=calc_scores_with_entropy, + count_words=param_count_words, skip_train=skip_train, + exclude_punct=exclude_punct, punct_tokens_set=punct_tokens_set ) + if verbose: + print(results) + # find experiment directory closest to root in experiment path + exp_root = get_experiment_path_root(gold_test) + if exp_root not in collected_results.keys(): + collected_results[exp_root] = dict() + collected_results[exp_root][experiment_name] = results + intermediate_results.append( (experiment_name, None, None, results) ) + else: + # + # Error types eval mode (count E1, E2, E3) + # + exp_name_raw = experiment_name if experiment_name.endswith('_') else experiment_name+'_' + if not skip_train: + train_errors = calculate_errors(gold_train, predicted_train, punct_tokens_set=punct_tokens_set, + remove_empty_nodes=True, add_counts=param_count_words, + format_string=format_string, error_sample_size=error_sample_size, + error_sample_output_file=output_train_error_sample_file) + if 
verbose: + print(f'{exp_name_raw}on_train |', train_errors) + intermediate_results.append( (f'{exp_name_raw}on_train', None, None, train_errors) ) + test_errors = calculate_errors(gold_test, predicted_test, punct_tokens_set=punct_tokens_set, + remove_empty_nodes=True, add_counts=param_count_words, + format_string=format_string, error_sample_size=error_sample_size, + error_sample_output_file=output_test_error_sample_file) + if verbose: + print(f'{exp_name_raw}on_test |', test_errors) + intermediate_results.append( (f'{exp_name_raw}on_test', None, None, test_errors) ) + else: + missing_files = [f for f in [predicted_test, gold_test, predicted_train, gold_train] if not os.path.exists(f)] + if ignore_missing: + print(f'Skipping evaluation because of missing files: {missing_files!r}') + else: + raise FileNotFoundError(f'(!) Cannot evaluate, missing evaluation files: {missing_files!r}') + elif experiment_type_clean in ['crossvalidation', 'half_data', 'smaller_data', 'multi_experiment']: + # -------------------------------------------------------------- + # 'multi_experiment' (general) + # 'crossvalidation' + # 'half_data' + # 'smaller_data' + # -------------------------------------------------------------- + # ============================================================== + # Multi-experiment evaluation modes (LAS, UAS): + # + # Default mode: + # * eval each model on its train file (if not skip_train) + # * eval each model on its test file or on the global test file + # + # Test matrix mode: + # * eval each model on its train file (if not skip_train) + # * eval each model on all test files + # ============================================================== + # gold_test and gold_splits_dir + if not config.has_option(section, 'gold_test'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "gold_test" parameter.') + gold_test = config[section]['gold_test'] + test_file_is_pattern = config[section].getboolean('test_file_is_pattern', False) + if not 
config.has_option(section, 'gold_splits_dir'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "gold_splits_dir" parameter.') + gold_splits_dir = config[section]['gold_splits_dir'] + if not config.has_option(section, 'predictions_dir'): + raise ValueError(f'Error in {conf_file}: section {section!r} is missing "predictions_dir" parameter.') + # Sanity check to avoid accidental confusion with 'full_data' experiments + error_types_mode = config[section].getboolean('count_error_types', False) + if error_types_mode: + raise NotImplementedError(f'Error in {conf_file}, section {section!r}: '+\ + f'count_error_types is not implemented for {experiment_type} evaluation.') + predictions_dir = config[section]['predictions_dir'] + predictions_prefix = config[section].get('predictions_prefix', 'predicted_') + macro_average = config[section].getboolean('macro_average', False) + param_count_words = config[section].getboolean('count_words', count_words) + experiment_name_prefix = config[section].get('name_prefix', section) + if not experiment_name_prefix.endswith('_'): + experiment_name_prefix = experiment_name_prefix + '_' + exclude_punct = config[section].getboolean('exclude_punct', False) + punct_tokens_file = config[section].get('punct_tokens_file', None) + punct_tokens_set = load_punct_tokens( punct_tokens_file ) # Attempt to load from file. 
If None, return empty set + output_csv_file = config[section].get('output_csv_file', None) # Output results to csv file right after evaluation + # Patterns for capturing names of training sub-experiment files + train_file_pat = r'(?P\d+)_train_all.conllu' + # Override sub-experiment patterns (if required) + if config.has_option(section, 'train_file_pat'): + train_file_pat = config[section]['train_file_pat'] + # Convert train_file_pattern to regular experssion + train_file_regex = None + if not isinstance(train_file_pat, str): + raise TypeError('train_file_pat must be a string') + try: + train_file_regex = re.compile(train_file_pat) + except Exception as err: + raise ValueError(f'Unable to convert {train_file_pat!r} to regexp') from err + if 'exp' not in train_file_regex.groupindex: + raise ValueError(f'Regexp {train_file_pat!r} is missing named group "exp"') + # skip_train: do not evaluate on train files + skip_train = config[section].getboolean('skip_train', False) + test_file_regex = None + if test_file_is_pattern: + if not isinstance(gold_test, str): + raise TypeError('gold_test must be a string') + try: + test_file_regex = re.compile(gold_test) + except Exception as err: + raise ValueError(f'Unable to convert {gold_test!r} to regexp') from err + if 'exp' not in test_file_regex.groupindex: + raise ValueError(f'Regexp {gold_test!r} is missing named group "exp"') + # test_matrix evaluation mode: evaluate all models on all test files + test_matrix = config[section].getboolean('test_matrix', False) + if macro_average and test_matrix: + raise Exception('macro_average not implemented for test_matrix evaluation mode') + # Validate main paths + # Find out missing paths + paths_to_check = [predictions_dir, gold_splits_dir] + if not test_file_is_pattern: + paths_to_check.append(gold_test) + missing_paths = [] + for input_path in paths_to_check: + if not os.path.exists(input_path): + missing_paths.append(input_path) + if missing_paths: + # 
================================================== + # Report missing main paths + # ================================================== + if ignore_missing: + print(f'Skipping evaluation because of missing dirs/files: {missing_paths!r}') + else: + raise FileNotFoundError(f'(!) Cannot evaluate, missing evaluation dirs/files: {missing_paths!r}') + else: + # ================================================== + # Main paths are OK. Proceed with gathering files + # ================================================== + # Try to collect evaluation files + # Gold train files + all_gold_train_files = {} + for gold_file in sorted( os.listdir(gold_splits_dir) ): + if (gold_file.lower()).endswith('.conllu'): + m = train_file_regex.match(gold_file) + if m: + no = m.group('exp') + all_gold_train_files[no] = \ + os.path.join(gold_splits_dir, gold_file) + # Gold test files + all_gold_test_files = {} + if test_file_regex is not None: + # Multiple test files + for gold_file in sorted(os.listdir(gold_splits_dir)): + if (gold_file.lower()).endswith('.conllu'): + m2 = test_file_regex.match(gold_file) + if m2: + no2 = m2.group('exp') + all_gold_test_files[no2] = \ + os.path.join(gold_splits_dir, gold_file) + else: + # Single test file for all models + for cur_subexp in sorted( all_gold_train_files.keys() ): + all_gold_test_files[cur_subexp] = gold_test + # Exception if no gold train/test files are available + if len(all_gold_train_files.keys()) == 0: + raise Exception('(!) Unable to construct sub experiment names, '+\ + f'because no train files were found from {gold_splits_dir}. '+\ + f'Please check that {train_file_pat!r} is a proper regexp for '+\ + 'recognizing train files from the directory.') + # Some sanity checks + if len(all_gold_train_files.keys()) > 0 or len(all_gold_test_files.keys()) > 0: + if not skip_train: + # Train and test set names must match + if not (all_gold_train_files.keys() == all_gold_test_files.keys()): + raise ValueError('(!) 
Mismatching train and test sub-experiment '+\ + 'names in gold_splits_dir. Missing any files? '+\ + f'Train experiments: {list(all_gold_train_files.keys())!r}; '+\ + f'Test experiments: {list(all_gold_test_files.keys())!r} ') + else: + if not len(all_gold_test_files.keys()) > 0: + raise ValueError(f'(!) No test files found from {gold_splits_dir}') + elif len(all_gold_train_files.keys()) == 0 and len(all_gold_test_files.keys()) == 0: + raise ValueError(f'(!) No train or test files found from {gold_splits_dir}') + # ================================================== + # Iterate over sub experiments and evaluate + # ================================================== + evaluations_done = 0 + results_macro_avg = dict() + for cur_subexp in sorted( all_gold_test_files.keys() ): + current_gold_test = all_gold_test_files[cur_subexp] + gold_train = all_gold_train_files.get(cur_subexp, None) + assert gold_train is not None or test_matrix + missing_prediction_files = [] + # Find corresponding train prediction + target_predicted_train = f'{predictions_prefix}train_{cur_subexp}.conllu' + predicted_train = os.path.join(predictions_dir, target_predicted_train) + # Find corresponding test predictions + if test_matrix: + # Multiple test predictions (1 from each model) + predicted_test_files = [] + predicted_test_models = [] + for model_subexp in sorted(all_gold_test_files.keys()): + test_output_fpath = os.path.join(predictions_dir, \ + f'{predictions_prefix}model_{model_subexp}_test_{cur_subexp}.conllu') + predicted_test_files.append( test_output_fpath ) + predicted_test_models.append( model_subexp ) + else: + # Single test prediction + target_predicted_test = f'{predictions_prefix}test_{cur_subexp}.conllu' + predicted_test_files = [os.path.join(predictions_dir, target_predicted_test)] + predicted_test_models = [None] + # Check for existence of files + paths_to_check2 = [] + if not skip_train: + paths_to_check2.append( predicted_train ) + paths_to_check2.extend( predicted_test_files 
) + for predicted_path in paths_to_check2: + if not os.path.exists(predicted_path) or \ + not os.path.isfile(predicted_path): + missing_prediction_files.append(predicted_path) + if len(missing_prediction_files) == 0: + # ================================================== + # All required files exist: evaluate + # ================================================== + for predicted_test, model_subexp in zip(predicted_test_files, predicted_test_models): + results = score_experiment( predicted_test, current_gold_test, + predicted_train, gold_train, + format_string=None, + count_words=param_count_words, + skip_train=skip_train, + exclude_punct=exclude_punct, + punct_tokens_set=punct_tokens_set ) + if macro_average: + # Collect macro averages + for k, v in results.items(): + if k not in results_macro_avg.keys(): + results_macro_avg[k] = [] + results_macro_avg[k].append(v) + if round: + # Find rounded results + format_string = ':.4f' + results_rounded = dict() + for k, v in results.items(): + if k not in ['train_words', 'test_words']: + results_rounded[k] = ('{'+format_string+'}').format(v) + else: + results_rounded[k] = '{}'.format(v) + results = results_rounded + # experiment name (depends on whether we use test matrix or not) + if not test_matrix: + experiment_name = f'{experiment_name_prefix}{cur_subexp}' + else: + experiment_name = f'{experiment_name_prefix}model_{model_subexp}_test_{cur_subexp}' + # find experiment directory closest to root in experiment path + exp_root = get_experiment_path_root(current_gold_test) + if exp_root not in collected_results.keys(): + collected_results[exp_root] = dict() + if verbose: + print(exp_root, experiment_name, results) + collected_results[exp_root][experiment_name] = results + intermediate_results.append( (experiment_name, cur_subexp, model_subexp, results) ) + evaluations_done += 1 + else: + # Missing files + if ignore_missing: + print(f'Skipping evaluation because of missing files: {missing_prediction_files!r}') + else: + raise 
FileNotFoundError(f'(!) Cannot evaluate, missing evaluation files: {missing_prediction_files!r}') + if macro_average and evaluations_done > 1: + # Find macro averages + calculated_averages = dict() + for k, v in results_macro_avg.items(): + calculated_averages[k] = mean(v) + if round: + if k not in ['train_words', 'test_words']: + calculated_averages[k] = \ + ('{'+format_string+'}').format( calculated_averages[k] ) + else: + calculated_averages[k] = \ + ('{}').format( int(calculated_averages[k]) ) + assert exp_root in collected_results.keys() + experiment_name = f'{experiment_name_prefix}{"AVG"}' + if verbose: + print(exp_root, experiment_name, calculated_averages) + collected_results[exp_root][experiment_name] = calculated_averages + intermediate_results.append( (experiment_name, None, None, calculated_averages) ) + # ================================================== + # Output intermediate results (optional) + # ================================================== + if output_csv_file is not None and intermediate_results: + print(f'Writing evaluation results into {output_csv_file} ...') + with open(output_csv_file, 'w', encoding='utf-8', newline='') as output_csv: + csv_writer = csv.writer(output_csv) + if test_matrix: + # Write matrix of results + # rows: test sets, columns: models + subexp_names = [] + for r in intermediate_results: + if r[1] not in subexp_names: + subexp_names.append(r[1]) + header = [''] + subexp_names + csv_writer.writerow( header ) + values = [] + lines_written = 0 + for (full_exp_name, exp1, exp2, results) in intermediate_results: + if (exp1 is not None) and (exp2 is not None): + if not values: + values.append(exp1) + values.append(results['LAS_test']) + assert exp1 == values[0] + assert exp2 == subexp_names[len(values)-2] + if len(values) == len(subexp_names) + 1: + csv_writer.writerow( values ) + lines_written += 1 + values = [] + assert lines_written == len(subexp_names) + else: + # Write listing of results + result_fields = 
def get_experiment_path_root( experiment_path ):
    '''
    Finds the path component closest to the root in the given experiment path.

    The path is peeled from the right with os.path.split() and the last
    non-empty component seen is returned. For a relative path such as
    'exp/sub/file.conllu' this is 'exp'. For an absolute path such as
    '/exp/sub/file.conllu' this is also 'exp' (the first component after
    the root).

    :param experiment_path: path (string) to a file or directory.
    :return: name of the component closest to the root, or None if the
        path is empty or consists only of a root/separator.
    '''
    closest_to_root = None
    while len(experiment_path) > 0:
        head, tail = os.path.split( experiment_path )
        if head == experiment_path:
            # os.path.split() made no progress: only a filesystem root
            # (e.g. '/') is left. Without this stop the loop would never
            # terminate on absolute paths.
            break
        if len(tail) > 0:
            # tail is empty only for paths ending with a separator
            closest_to_root = tail
        experiment_path = head
    return closest_to_root

def load_punct_tokens( fname ):
    '''
    Loads set of punctuation tokens from given file.

    The file is read as UTF-8, one token per line. Surrounding whitespace
    is stripped from every line and empty lines are ignored.

    :param fname: name of the file, or None.
    :return: set of punctuation tokens; an empty set if `fname` is None.
    :raises FileNotFoundError: if `fname` is given, but the file is missing.
    '''
    punct_tokens = set()
    if fname is None:
        return punct_tokens
    if not os.path.exists(fname):
        raise FileNotFoundError(f'(!) Non-existend punct tokens file: {fname!r}')
    with open(fname, 'r', encoding='utf-8') as in_f:
        for line in in_f:
            line = line.strip()
            if len(line) > 0:
                punct_tokens.add(line)
    return punct_tokens
+ If `count_words=True`, then adds evaluation word counts (keys 'train_words' + and 'test_words') to the results. + If `skip_train` is True, then calculates only test scores and returns dictionary + with calculated test scores (keys: "LAS_test", "UAS_test"). + If `add_conf_intervals` is True, then scores will be computed as mean values with + 95% confidence intervals. Instead of a single value, each score will then be in + form "lower_bound; mean; upper_bound". + If `calc_scores_with_entropy` is True, then adds additional computations to + default scores: computes LAS exclusively over words that have ensemble prediction + entropy 0, and finds a correlation between ensemble prediction entropy and reversed + gold standard match (0-match, 1-mismatch). Note: this only works if ensemble + predictions have generated prediction .entropy files in addition to prediction + .conllu files. + ''' + # Check/validate input files and parameters + input_files = { \ + 'predicted_test': predicted_test, + 'gold_test': gold_test, + 'predicted_train': predicted_train, + 'gold_train': gold_train } + for name, fpath in input_files.items(): + if skip_train and name.endswith('_train'): + # skip evaluation on train files + continue + full_path = fpath + if fpath is None: + raise FileNotFoundError(f'(!) Unexpected None value for {name} file name.') + # Update full path (if required) + if name.startswith('gold_') and gold_path is not None: + full_path = os.path.join(gold_path, fpath) + if name.startswith('predicted') and predicted_path is not None: + full_path = os.path.join(predicted_path, fpath) + if not os.path.isfile(full_path): + raise FileNotFoundError(f'(!) {name} file cannot be found at {full_path!r}.') + input_files[name] = full_path + if calc_scores_with_entropy and add_conf_intervals: + raise ValueError('(!) Cannot use calc_scores_with_entropy and add_conf_intervals simultaneously. 
'+\ + 'Pick only one of the input parameters.') + # Calculate scores + if add_conf_intervals: + test_scores = calculate_scores_with_conf_interval(input_files['gold_test'], + input_files['predicted_test'], + N=10, + seed=1, + count_words=count_words, + exclude_punct=exclude_punct, + punct_tokens_set=punct_tokens_set) + elif calc_scores_with_entropy: + test_scores = calculate_scores_with_entropy(input_files['gold_test'], + input_files['predicted_test'], + exclude_punct=exclude_punct, + punct_tokens_set=punct_tokens_set) + else: + test_scores = calculate_scores(input_files['gold_test'], + input_files['predicted_test'], + count_words=count_words, + exclude_punct=exclude_punct, + punct_tokens_set=punct_tokens_set) + + if skip_train: + train_scores = None + results_dict = {'LAS_test' : test_scores['LAS'], + 'UAS_test' : test_scores['UAS']} + else: + if add_conf_intervals: + train_scores = calculate_scores_with_conf_interval(input_files['gold_train'], + input_files['predicted_train'], + N=10, + seed=1, + count_words=count_words, + exclude_punct=exclude_punct, + punct_tokens_set=punct_tokens_set) + elif calc_scores_with_entropy: + train_scores = calculate_scores_with_entropy(input_files['gold_train'], + input_files['predicted_train'], + exclude_punct=exclude_punct, + punct_tokens_set=punct_tokens_set) + else: + train_scores = calculate_scores(input_files['gold_train'], + input_files['predicted_train'], + count_words=count_words, + exclude_punct=exclude_punct, + punct_tokens_set=punct_tokens_set) + results_dict = {'LAS_test' : test_scores['LAS'], + 'LAS_train' : train_scores['LAS'], + 'UAS_test' : test_scores['UAS'], + 'UAS_train' : train_scores['UAS']} + if add_conf_intervals: + results_dict['LAS_gap'] = (train_scores['LAS'][0] - test_scores['LAS'][0], + train_scores['LAS'][1] - test_scores['LAS'][1], + train_scores['LAS'][2] - test_scores['LAS'][2]) + else: + results_dict['LAS_gap'] = train_scores['LAS'] - test_scores['LAS'] + # Add scores with entropy + if 
calc_scores_with_entropy: + # LAS with zero entropy + results_dict['test_LAS_0_entropy'] = \ + test_scores['LAS_with_0_entropy'] + results_dict['test_LAS_0_entropy_matches'] = \ + test_scores['LAS_with_0_entropy_matches'] + results_dict['test_LAS_0_entropy_total_words'] = \ + test_scores['LAS_with_0_entropy_total_words'] + # Size of LAS with zero entropy with respect to full test + if 'total_words' in test_scores: + results_dict['test_LAS_0_entropy_total_words_%'] = \ + (results_dict['test_LAS_0_entropy_total_words']/test_scores['total_words'])*100.0 + # Correlation between LAS match and entropy + results_dict['test_LAS_vs_entropy_corr'] = \ + test_scores['LAS_vs_entropy_correlation'] + results_dict['test_LAS_vs_entropy_corr_pvalue'] = \ + test_scores['LAS_vs_entropy_corr_pvalue'] + if train_scores is not None: + # LAS with zero entropy + results_dict['train_LAS_0_entropy'] = \ + train_scores['LAS_with_0_entropy'] + results_dict['train_LAS_0_entropy_matches'] = \ + train_scores['LAS_with_0_entropy_matches'] + results_dict['train_LAS_0_entropy_total_words'] = \ + train_scores['LAS_with_0_entropy_total_words'] + # Correlation between LAS match and entropy + results_dict['train_LAS_vs_entropy_corr'] = \ + train_scores['LAS_vs_entropy_correlation'] + results_dict['train_LAS_vs_entropy_corr_pvalue'] = \ + train_scores['LAS_vs_entropy_corr_pvalue'] + # Size of LAS with zero entropy with respect to full train + if 'total_words' in train_scores: + results_dict['train_LAS_0_entropy_total_words_%'] = \ + (results_dict['train_LAS_0_entropy_total_words']/train_scores['total_words'])*100.0 + # Reformat results + if format_string is not None: + for k, v in results_dict.items(): + if isinstance(v, int): + continue + elif isinstance(v, float): + results_dict[k] = ('{'+format_string+'}').format(v) + elif isinstance(v, tuple): + parts = [('{'+format_string+'}').format(vi) for vi in v] + results_dict[k] = '; '.join(parts) + else: + raise TypeError('(!) 
def calculate_scores(gold_path: str, predicted_path: str, count_words=False, exclude_punct=False, punct_tokens_set=None):
    '''
    Calculates LAS, UAS and LA scores based on gold annotations and predicted annotations
    loaded from conllu files `gold_path` and `predicted_path`.
    Assumes that there are no tokenization differences in the gold and predicted outputs.
    If exclude_punct==True, then discards punctuation from calculations: tokens with
    xpos == 'Z', and (if punct_tokens_set is given) tokens which form appears in
    punct_tokens_set.
    Always discards null nodes (tokens with non-integer id-s) from calculations.
    Returns dictionary with scores (keys: "LAS", "UAS", "LA"):
      * LAS -- share of words with matching head and deprel;
      * UAS -- share of words with matching head;
      * LA  -- share of words with matching deprel;
    If `count_words=True`, then adds evaluation word count (key 'total_words') to the results.

    Raises AssertionError if the number of sentences or the word forms of any sentence
    differ between gold and predicted files.
    Raises ZeroDivisionError if no words are left for scoring (e.g. empty input files).

    Note: if exclude_punct==False, then the LAS calculated here is compatible with the LAS.f1
    calculated in CoNLL 2018 Shared Task. However, there's an exception: in the CoNLL 2018
    evaluation, language-specific deprel subtypes are truncated (e.g. 'acl:relcl' is reduced
    to 'acl' in both predicted and gold), but we compare deprels as they are. As a result,
    CoNLL 2018 evaluation can give higher scores than this method.
    '''
    # Load annotated texts from conllu files
    with open(gold_path, 'r', encoding='utf-8') as in_f:
        gold_sents = conllu.parse(in_f.read())
    with open(predicted_path, 'r', encoding='utf-8') as in_f_2:
        predicted_sents = conllu.parse(in_f_2.read())
    # Explicit raise instead of assert statement: the check must survive python -O
    if len(gold_sents) != len(predicted_sents):
        raise AssertionError(
            f'(!) Mismatching sizes: gold_sents: {len(gold_sents)}, predicted_sents: {len(predicted_sents)}')

    las_match_count = 0
    uas_match_count = 0
    la_match_count = 0
    total_words = 0

    for gold_sentence, predicted_sentence in zip(gold_sents, predicted_sents):
        gold_sentence_words = [w['form'] for w in gold_sentence]
        auto_sentence_words = [w['form'] for w in predicted_sentence]
        if gold_sentence_words != auto_sentence_words:
            raise AssertionError(
                f'Tokenization mismatch in {predicted_path!r} | {gold_sentence_words} vs {auto_sentence_words}')
        # Both sentences have the same tokens in the same order (checked above),
        # so tokens are paired by position. This keeps gold and predicted words
        # aligned even after a null node or a punctuation token is skipped.
        for gold_word, predicted_word in zip(gold_sentence, predicted_sentence):
            if not isinstance(gold_word['id'], int):
                # Null node: not scored
                continue
            if exclude_punct:
                # Exclude punctuation from evaluation
                if punct_tokens_set is not None and gold_word['form'] in punct_tokens_set:
                    continue
                if gold_word['xpos'] == 'Z':
                    continue

            total_words += 1

            same_deprel = predicted_word['deprel'] == gold_word['deprel']
            same_head = predicted_word['head'] == gold_word['head']
            if same_deprel:
                la_match_count += 1
            if same_head:
                uas_match_count += 1
            if same_deprel and same_head:
                las_match_count += 1

    if total_words == 0:
        # Same exception type as the plain division would give, but with an explanation
        raise ZeroDivisionError(
            f'(!) No words to evaluate in {gold_path!r} vs {predicted_path!r}: cannot calculate scores.')
    result = \
        {'LAS': las_match_count / total_words,
         'UAS': uas_match_count / total_words,
         'LA': la_match_count / total_words}
    if count_words:
        result['total_words'] = total_words
    return result
+ For that, the test sentences are shuffled and split into N sub sets, scores are measured on each + sub set, and, finally, the confidence intervals for means of scores are computed assuming a Normal + distribution. + + Returns dictionary with scores (keys: "LAS", "UAS", "LA"), in which each value is a triple: + (lower_bound, mean, upper_bound). + ''' + # Load annotated texts from conllu files + gold_sents = None + predicted_sents = None + with open(gold_path, 'r', encoding='utf-8') as in_f: + gold_sents = conllu.parse(in_f.read()) + with open(predicted_path, 'r', encoding='utf-8') as in_f_2: + predicted_sents = conllu.parse(in_f_2.read()) + assert len(gold_sents) == len(predicted_sents), \ + f'(!) Mismatching sizes: gold_sents: {len(gold_sents)}, predicted_sents: {len(predicted_sents)}' + # Pair all predicted and gold sentences + all_sentence_pairs = [] + for i, gold_sentence in enumerate(gold_sents): + predicted_sentece = predicted_sents[i] + all_sentence_pairs.append( (i, gold_sentence, predicted_sentece) ) + rnd = random.Random(seed) + # Randomize pairs and divide into N sub groups + rnd_groups = [] + rnd.shuffle( all_sentence_pairs ) + for i in range(N): + rnd_groups.append([]) + for pid, (j, gold_s, pred_s) in enumerate(all_sentence_pairs): + rnd_groups[pid % N].append( (j, gold_s, pred_s) ) + assert len(all_sentence_pairs) == sum([len(g) for g in rnd_groups]) + # Calculate LAS scores for sub groups + LA_scores = [] + UAS_scores = [] + LAS_scores = [] + abs_total_words = 0 + for gid, group in enumerate(rnd_groups): + las_match_count = 0 + uas_match_count = 0 + la_match_count = 0 + total_words = 0 # total words in the group + for i, (j, gold_sentence, predicted_sentece) in enumerate(group): + gold_sentence_words = [w['form'] for w in gold_sentence] + auto_sentence_words = [w['form'] for w in predicted_sentece] + assert gold_sentence_words == auto_sentence_words, \ + f'Tokenization mismatch in {predicted_path!r} | {gold_sentence_words} vs {auto_sentence_words}' 
+ word_tracker = 0 + for gold_word in gold_sentence: + if not isinstance(gold_word['id'], int): + continue + if exclude_punct: + # Exclude punctuation from evaluation + if punct_tokens_set is not None: + if gold_word['form'] in punct_tokens_set: + word_tracker += 1 + continue + if gold_word['xpos'] == 'Z': + word_tracker += 1 + continue + total_words += 1 + predicted_word = predicted_sentece[word_tracker] + if predicted_word['deprel'] == gold_word['deprel'] and predicted_word['head'] == gold_word['head']: + las_match_count += 1 + la_match_count += 1 + uas_match_count += 1 + elif predicted_word['deprel'] == gold_word['deprel']: + la_match_count += 1 + elif predicted_word['head'] == gold_word['head']: + uas_match_count += 1 + word_tracker += 1 + # Calculate group scores + group_LAS = las_match_count / total_words + group_UAS = uas_match_count / total_words + group_LA = la_match_count / total_words + LA_scores.append( group_LA ) + UAS_scores.append( group_UAS ) + LAS_scores.append( group_LAS ) + abs_total_words += total_words + assert len(LA_scores) == N + assert len(LAS_scores) == N + assert len(UAS_scores) == N + # Calculate confidence intervals + LA_mean = mean(LA_scores) + LAS_mean = mean(LAS_scores) + UAS_mean = mean(UAS_scores) + LAS_lower, LAS_upper = scipy_stats.norm.interval(confidence, loc=LAS_mean, + scale=scipy_stats.sem(n_array(LAS_scores))) + LA_lower, LA_upper = scipy_stats.norm.interval(confidence, loc=LA_mean, + scale=scipy_stats.sem(n_array(LA_scores))) + UAS_lower, UAS_upper = scipy_stats.norm.interval(confidence, loc=UAS_mean, + scale=scipy_stats.sem(n_array(UAS_scores))) + result = \ + {'LAS': (LAS_lower, LAS_mean, LAS_upper), + 'UAS': (UAS_lower, UAS_mean, UAS_upper), + 'LA': (LA_lower, LA_mean, LA_upper)} + if count_words: + result['total_words'] = abs_total_words + return result + + +def entropy_file_exists(predicted_path: str): + ''' + Checks whether an .entropy file corresponding to the given .conllu file exists. 
+ ''' + predicted_path_entropy = None + if predicted_path.endswith('.conllu'): + predicted_path_entropy = predicted_path.replace('.conllu', '.entropy') + return os.path.isfile(predicted_path_entropy) + return False + + +def calculate_scores_with_entropy(gold_path: str, predicted_path: str, exclude_punct=False, punct_tokens_set=None): + ''' + Calculates LAS, UAS and LA scores based on gold annotations and predicted annotations, + and extends the results with entropy scores. + For entropy scores, computes LAS exclusively over words that have ensemble prediction + entropy 0, and finds Pearson's correlation between the ensemble prediction entropy and a + reversed gold standard matching (encoded as: 0-match, 1-mismatch). Note: this function + only works if ensemble predictions have generated prediction .entropy files in addition + to prediction .conllu files. If .entropy files are missing, the function fails with an + exception. + Flags `exclude_punct` and `punct_tokens_set` behave the same way as in the function + `calculate_scores`. + ''' + from scipy.stats import pearsonr + # Load annotated texts from conllu files + gold_sents = None + predicted_sents = None + with open(gold_path, 'r', encoding='utf-8') as in_f: + gold_sents = conllu.parse(in_f.read()) + with open(predicted_path, 'r', encoding='utf-8') as in_f_2: + predicted_sents = conllu.parse(in_f_2.read()) + assert len(gold_sents) == len(predicted_sents), \ + f'(!) Mismatching sizes: gold_sents: {len(gold_sents)}, predicted_sents: {len(predicted_sents)}' + # Get entropy file path + predicted_path_entropy = None + if entropy_file_exists(predicted_path): + predicted_path_entropy = predicted_path.replace('.conllu', '.entropy') + else: + if not predicted_path.endswith('.conllu'): + warnings.warn( f'(!) Unexpected file ending in {predicted_path} (expected .conllu). ') + raise FileNotFoundError(\ + f"(!) 
Unable to find .entropy file corresponding to {predicted_path!r}") + # Load entropy annotations + sentence_id = 0 + words_entropy = [] + with open(predicted_path_entropy, 'r', encoding='utf-8') as in_f_3: + for line in in_f_3: + line_clean = line.strip() + if len(line_clean) > 0: + votes, entropy = line_clean.split('\t') + entropy = float(entropy) + words_entropy.append( \ + { 'sentence_id': sentence_id, + 'entropy': entropy} ) + else: + # Next sentence + sentence_id += 1 + entropy_all = [] + LAS_accuracy = [] + LAS_accuracy_with_zero_entropy = [] + UAS_accuracy = [] + LA_accuracy = [] + global_word_tracker = 0 + for i, gold_sentence in enumerate(gold_sents): + predicted_sentece = predicted_sents[i] + gold_sentence_words = [w['form'] for w in gold_sentence] + auto_sentence_words = [w['form'] for w in predicted_sentece] + assert gold_sentence_words == auto_sentence_words, \ + f'Tokenization mismatch in {predicted_path!r} | {gold_sentence_words} vs {auto_sentence_words}' + sentence_word_tracker = 0 + for gold_word in gold_sentence: + if not isinstance(gold_word['id'], int): + continue + if exclude_punct: + # Exclude punctuation from evaluation + if punct_tokens_set is not None: + if gold_word['form'] in punct_tokens_set: + global_word_tracker += 1 + sentence_word_tracker += 1 + continue + if gold_word['xpos'] == 'Z': + global_word_tracker += 1 + sentence_word_tracker += 1 + continue + cur_word_entropy = words_entropy[global_word_tracker]['entropy'] + predicted_word = predicted_sentece[sentence_word_tracker] + has_las_match = \ + predicted_word['deprel'] == gold_word['deprel'] and predicted_word['head'] == gold_word['head'] + has_la_match = \ + predicted_word['deprel'] == gold_word['deprel'] + has_uas_match = \ + predicted_word['head'] == gold_word['head'] + global_word_tracker += 1 + sentence_word_tracker += 1 + entropy_all.append(cur_word_entropy) + # Use encoding: 0-match, 1-mismatch in order + # to better align matching with the entropy + # ( H(X)=0 certainty, 
H(X)>0 uncertainty ) + LAS_accuracy.append(0 if has_las_match else 1) + LA_accuracy.append(0 if has_la_match else 1) + UAS_accuracy.append(0 if has_uas_match else 1) + if cur_word_entropy == 0.0: + # Use encoding: 0-mismatch, 1-match + LAS_accuracy_with_zero_entropy.append(1 if has_las_match else 0) + # Find correlation + corr, pvalue = pearsonr(entropy_all, LAS_accuracy) + result = \ + {'LAS': LAS_accuracy.count(0) / len(LAS_accuracy), + 'UAS': UAS_accuracy.count(0) / len(UAS_accuracy), + 'LA': LA_accuracy.count(0) / len(LA_accuracy), + # LAS with zero entropy + 'LAS_with_0_entropy_matches' : LAS_accuracy_with_zero_entropy.count(1), + 'LAS_with_0_entropy_total_words' : len(LAS_accuracy_with_zero_entropy), + 'LAS_with_0_entropy': \ + LAS_accuracy_with_zero_entropy.count(1) / len(LAS_accuracy_with_zero_entropy), + # Correlation between LAS match and entropy + 'LAS_vs_entropy_correlation': corr, + 'LAS_vs_entropy_corr_pvalue': pvalue, + # All words used in calculations + 'total_words': len(LAS_accuracy) + } + return result + + +def calculate_errors(gold_file, predicted_file, punct_tokens_set=None, remove_empty_nodes=True, + add_impact=True, add_rel_error=True, add_counts=True, root_outside_clause=True, + format_string=None, error_sample_size=None, error_sample_output_file=None): + ''' + Adds automatic clause annotations to the input text (via EstNLTK), and decomposes + errors made by the system according to dependency misplacements inside or outside + the clause. 
The following error types are distinguished: + + * E1 (local error): according to both gold standard and parser, the head of a word is + inside the same clause but gold standard and parser do not agree on the exact word + and/or deprel; + + * E2 (local-global error): according to gold standard, the head of a word is inside + the same clause, but parser thinks it is outside of the clause; + + * E3 (global error): according to gold standard, the word's head is outside of the + clause, and the parser got the head wrong (placed it either inside or outside the + clause); + + Discards punctuation (tokens with xpos == 'Z', or alternatively, tokens appearing + in given punct_tokens_set) from error calculations. + + Note: by default, root nodes (words with head==0) are always considered as being + "outside the clause". Use flag root_outside_clause=False to count root nodes as + being "inside the clause" (this will increase E1 errors while decreasing E2 and E3 + errors). + + Returns dictionary with calculated error counts ('E1', 'E2', 'E3'), and + additional statistics (see parameters add_impact, add_rel_error and add_counts + below for details). + + Requires EstNLTK version 1.7.2+. + + Parameters + ----------- + gold_file + CONLL-U format input file with gold standard syntax annotations + predicted_file + CONLL-U format input file with automatically predicted syntax annotations + punct_tokens_set + set with tokens that are considered as punctionation and that will be + discarded from evaluation. Undefined by default. + remove_empty_nodes + If True (default), then null / empty nodes (of the enhanced representation) + will be removed from input files (and discarded from calculations). + add_impact + If True (default), then calculates impacts and adds 'E1_impact', 'E2_impact', + and 'E3_impact' to the returned dictionary. Impact is the percentage of the + error from all errors. 
+ add_rel_error + If True (default), then calculates relative errors and adds 'E1_rel_error', + 'E2_rel_error', and 'E3_rel_error' to the returned dictionary. Relative error + is the percentage from all arcs that can lead to given error type. + add_counts + If True (default), then adds token counts 'total_no_punct', 'correct', + 'gold_in_clause', 'gold_out_of_clause', 'total_words', 'punct', + 'unequal_length' 'E1_missed_root', 'E2_missed_root', 'E3_missed_root', + 'E2_with_E3', 'E2_without_E3' to the returned dictionary. + root_outside_clause: + If True (default), then root nodes (words with head==0) are always considered + as being "outside the clause". Otherwise, root nodes are considered as being + "inside the clause". + format_string + If `format_string` provided (not None), then uses it to reformat values of + impacts and relative errors. For instance, if `format_string=':.4f'`, then + impacts and relative errors will be rounded to 4 decimals. + error_sample_size + Optional. The number of erroneously parsed sentences to be randomly extracted + and saved into error_sample_output_file. + Only applies if both error_sample_size and error_sample_output_file are defined. + error_sample_output_file + Optional. A path to a output file into which randomly extracted erroneously + parsed sentences will be saved. + Only applies if both error_sample_size and error_sample_output_file are defined. + ''' + if punct_tokens_set is None: + punct_tokens_set = set() + # Load gold standard and automatic annotations into separate layers + text = conll_to_text(gold_file, syntax_layer='gold', remove_empty_nodes=remove_empty_nodes) + add_layer_from_conll(file=predicted_file, text=text, syntax_layer='parsed') + # Validate input sizes + assert len(text['gold']) == len(text['parsed']), \ + f"(!) 
Mismatching input sizes: gold_words: {len(text['gold'])}, predicted_words: {len(text['parsed'])}" + # Add automatic clauses annotation + text.tag_layer('clauses') + # Count errors + punct = 0 + e1 = 0 + e2 = 0 + e3 = 0 + correct = 0 + unequal_length = 0 + total = 0 + total_no_punct = 0 + gold_in_clause = 0 + gold_out_of_clause = 0 + e1_missed_root = 0 + e2_missed_root = 0 + e3_missed_root = 0 + e2_together_with_e3 = 0 + e2_without_e3 = 0 + error_locations = {'E1':[], 'E2':[], 'E3': []} + for idx, clause in enumerate(text.clauses): + wordforms = list(clause.gold.text) + gold_ids = list(clause.gold.id) + gold_heads = list(clause.gold.head) + gold_deprel = list(clause.gold.deprel) + gold_pos = list(clause.gold.xpostag) + gold_word_loc = list(clause.base_span) + assert len(gold_word_loc) == len(wordforms) + try: + parsed_heads = list(clause.parsed.head) + parsed_deprel = list(clause.parsed.deprel) + except AssertionError: + unequal_length += len(gold_heads) + total += len(gold_heads) + continue + in_clause_heads = gold_ids[:] + # Collect clause roots (words pointing outside the clause) + clause_roots = [] + for wid, head in zip(gold_ids, gold_heads): + if head not in gold_ids: + clause_roots.append(wid) + if not root_outside_clause: + # Count root node as being "inside + # the clause" rather than "outside + # the clause" (which is default). + # This shifts error distribution + # from E2 & E3 to E1. 
+ in_clause_heads.append( 0 ) + # Collect specific errors inside clause + # (which can be correlated) + clause_e2 = [] + clause_e3 = [] + for gold_id, word_loc, wordform, pos, gold_head, parsed_head, gold_dep, parsed_dep in \ + zip(gold_ids, gold_word_loc, wordforms, gold_pos, gold_heads, parsed_heads, gold_deprel, parsed_deprel): + is_gold_root = (gold_head == 0) + (gold_start, gold_end) = word_loc.start, word_loc.end + total += 1 + if wordform in punct_tokens_set: + punct += 1 + pass + elif pos == 'Z': + punct += 1 + pass + else: + total_no_punct += 1 + if gold_head in in_clause_heads: + gold_in_clause += 1 + if gold_head == parsed_head and gold_dep == parsed_dep: + correct += 1 + else: + if parsed_head in in_clause_heads: + # local error: misplaced dependency inside the clause + e1 += 1 + if is_gold_root: + e1_missed_root += 1 + error_locations['E1'].append( (gold_start, gold_end) ) + else: + # overarcing error: + # misplaced dependency outside the clause, + # although it should be inside the clause + e2 += 1 + if is_gold_root: + e2_missed_root += 1 + if gold_head != parsed_head: + clause_e2.append(gold_id) + error_locations['E2'].append( (gold_start, gold_end) ) + else: + gold_out_of_clause += 1 + if gold_head == parsed_head and gold_dep == parsed_dep: + correct += 1 + else: + # global error: misplaced dependency which should be + # outside the clause (but was placed incorrectly + # inside or outside the clause) + e3 += 1 + if is_gold_root: + e3_missed_root += 1 + if gold_head != parsed_head and gold_id in clause_roots: + clause_e3.append(gold_id) + error_locations['E3'].append( (gold_start, gold_end) ) + if len(clause_e2) > 0: + if len(clause_e3) > 0: + # At least some of the clause roots were wrong and E2 errors appeared + e2_together_with_e3 += len( clause_e2 ) + elif len(clause_e3) == 0: + # Clause roots were correct, but E2 errors still appeared + # (parser pointed accidentially outside of the clause) + e2_without_e3 += len( clause_e2 ) + if 
error_sample_output_file is not None and isinstance(error_sample_size, int): + # Pick randomly a sample of errors and write into a file + extract_error_samples(text, error_locations, + error_sample_output_file, + n=error_sample_size, seed=1) + # Calculate impacts/relative errors and format results (if required) + result = {'E1': e1, 'E2': e2, 'E3': e3} + if add_impact: + result['E1_impact'] = e1/(total_no_punct-correct) + result['E2_impact'] = e2/(total_no_punct-correct) + result['E3_impact'] = e3/(total_no_punct-correct) + if add_rel_error: + result['E1_rel_error'] = e1/gold_in_clause + result['E2_rel_error'] = e2/gold_in_clause + result['E3_rel_error'] = e3/gold_out_of_clause + if add_counts: + result['total_no_punct'] = total_no_punct + result['correct'] = correct + result['gold_in_clause'] = gold_in_clause + result['gold_out_of_clause'] = gold_out_of_clause + result['total_words'] = total + result['punct'] = punct + result['unequal_length'] = unequal_length + result['E1_missed_root'] = e1_missed_root + result['E2_missed_root'] = e2_missed_root + result['E3_missed_root'] = e3_missed_root + # E2 in clause which has (at least one) E3 + result['E2_with_E3'] = e2_together_with_e3 + # E2 in clause which does not have any E3 + result['E2_without_E3'] = e2_without_e3 + if format_string is not None: + # Reformat impacts and relative errors + for k, v in result.items(): + if k.endswith(('_impact', '_rel_error')): + result[k] = ('{'+format_string+'}').format(v) + return result + + +def extract_error_samples(text, error_locations, output_file, n=100, seed=1, clauses_layer='clauses', + senteces_layer='sentences', gold_syntax_layer='gold', + auto_syntax_layer='parsed'): + ''' + Extracts erroneously parsed sentences based on collected E1, E2, E3 error_locations. + By default, extracts randomly n=100 samples of each error, or, if n >= len(errors), + extracts all erroneously parsed sentences from the error type. + Writes extracted sentences into output_file. 
+ ''' + assert clauses_layer in text.layers + assert senteces_layer in text.layers + assert gold_syntax_layer in text.layers + assert auto_syntax_layer in text.layers + sent_locations = [(s.start, s.end, sid) for sid, s in enumerate(text[senteces_layer])] + clause_starts = set([cl.start for cl in text[clauses_layer]]) + clause_ends = set([cl.end for cl in text[clauses_layer]]) + # Clear output file + with open(output_file, 'w', encoding='utf-8') as out_f: + pass + rnd = random.Random(seed) + for err_key in error_locations.keys(): + err_sentences = [] + extracted_err_sents = [] + # Get all sentences containing errors + for (err_start, err_end) in error_locations[err_key]: + for (s_start, s_end, s_id) in sent_locations: + if s_start <= err_start and err_end <= s_end: + if (s_start, s_end, s_id) not in err_sentences: + err_sentences.append( (s_start, s_end, s_id) ) + break + if n < len(err_sentences): + # Make a random pick (if more than n sentences) + picked_sentences = rnd.sample(err_sentences, n) + else: + # Take all sentences with errors + picked_sentences = err_sentences + # Format sentences (mark errors) + for (s_start, s_end, s_id) in picked_sentences: + # Gather all errors inside the given sentence + sent_focus_errors = set() + sent_other_errors = dict() + for alt_err_key in error_locations.keys(): + for (err_start, err_end) in error_locations[alt_err_key]: + if s_start <= err_start and err_end <= s_end: + # Error inside the given sentence + if err_key == alt_err_key: + sent_focus_errors.add( (err_start, err_end) ) + else: + if (err_start, err_end) not in sent_other_errors: + sent_other_errors[(err_start, err_end)] = [] + sent_other_errors[(err_start, err_end)].append( alt_err_key ) + # Display words with labellings & errors + sent_formatted = [] + for word_span in text[senteces_layer][s_id]: + gold_word = text[gold_syntax_layer].get(word_span.base_span) + auto_word = text[auto_syntax_layer].get(word_span.base_span) + w_start, w_end = word_span.start, 
word_span.end + gold_id = gold_word.annotations[0]['id'] + gold_deprel = gold_word.annotations[0]['deprel'] + auto_deprel = auto_word.annotations[0]['deprel'] + gold_head = gold_word.annotations[0]['head'] + auto_head = auto_word.annotations[0]['head'] + word_text = word_span.text + error_type = '' + deprel_with_err = f'{gold_deprel}' + if (gold_deprel != auto_deprel): + deprel_with_err = f'{deprel_with_err} (auto: {auto_deprel})' + head_with_err = f'{gold_head}' + if (gold_head != auto_head): + head_with_err = f'{head_with_err} (auto: {auto_head})' + if (w_start, w_end) in sent_focus_errors: + error_type = f'((!{err_key}))' + elif (w_start, w_end) in sent_other_errors: + other_err_keys = [e.lower() for e in sent_other_errors[(w_start, w_end)]] + error_type = ';'.join(other_err_keys) + if word_span.start in clause_starts: + sent_formatted.append('CLAUSE_START') + sent_formatted.append( f'{gold_id} {word_text} {head_with_err} {deprel_with_err} {error_type}' ) + if word_span.end in clause_ends: + sent_formatted.append('CLAUSE_END') + sent_formatted.append('') + extracted_err_sents.append( '\n'.join(sent_formatted) ) + print(f'Writing {len(extracted_err_sents)} samples of {err_key} errors into {output_file} ...') + with open(output_file, 'a', encoding='utf-8') as out_f: + out_f.write('='*100) + out_f.write('\n') + out_f.write(f' {err_key}') + out_f.write('\n') + out_f.write('='*100) + out_f.write('\n') + for err_sent in extracted_err_sents: + out_f.write(err_sent) + out_f.write('\n') + + +# ======================================================================== + +if __name__ == '__main__': + # 1) Try to get configuration files from input args + # Optionally, user can also pass name of the output csv file + conf_files = [] + output_csv_file = 'results.csv' + for input_arg in sys.argv[1:]: + if (input_arg.lower()).endswith('.ini'): + conf_files.append( input_arg ) + elif (input_arg.lower()).endswith('.csv'): + output_csv_file = input_arg + if len(conf_files) == 0: + # 
2) Try to collect configuration files from the root dir + root_dir = '.' + for fname in sorted( os.listdir(root_dir) ): + if (fname.lower()).endswith('.ini'): + # Attempt to open experiment configuration from INI file + conf_file = os.path.join(root_dir, fname) + conf_files.append(conf_file) + if len(conf_files) > 0: + # Perform evaluations defined in configurations + collected_results = dict() + for conf_file in conf_files: + eval_main(conf_file, collected_results=collected_results, ignore_missing=True, + verbose=True, round=True, count_words=False) + # Save collected results into experiment root directory csv file + for exp_root in collected_results.keys(): + filename = os.path.join(exp_root, output_csv_file) if os.path.exists(exp_root) else f'results_{exp_root}.csv' + print(f'Writing evaluation results into {filename} ...') + with open(filename, 'w', encoding='utf-8', newline='') as output_csv: + csv_writer = csv.writer(output_csv) + header = None + for exp_name in collected_results[exp_root].keys(): + exp_fields = list(collected_results[exp_root][exp_name].keys()) + if header is None: + header = ['experiment'] + exp_fields + csv_writer.writerow( header ) + else: + assert header[1:] == exp_fields, \ + f'Error writing CSV: mismatching headers: {header[1:]!r} vs {exp_fields!r}' + values = [exp_name] + for key in header[1:]: + values.append( collected_results[exp_root][exp_name][key] ) + assert len(values) == len(header) + csv_writer.writerow( values ) + else: + raise Exception('(!) 
Could not found any configuration INI file from the root directory nor from input arguments') \ No newline at end of file diff --git a/06_result_tables.ipynb b/06_result_tables.ipynb new file mode 100644 index 00000000..798e8581 --- /dev/null +++ b/06_result_tables.ipynb @@ -0,0 +1,2724 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0ed091b5", + "metadata": {}, + "source": [ + "## Inspecting result CSV files" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d31710fd", + "metadata": {}, + "outputs": [], + "source": [ + "import os, os.path\n", + "import re\n", + "\n", + "from pandas import DataFrame as df\n", + "from pandas import read_csv\n", + "import pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dcdbccd8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_crossvalidation.csv\n", + "results_ensemble_conf_intervals.csv\n", + "results_ensemble_entropy.csv\n", + "results_ensemble_majority_voting.csv\n", + "results_full_data_malt_udpipe1.csv\n", + "results_gap_experiments.csv\n", + "results_gold_and_auto_ud_morph.csv\n", + "results_half_data.csv\n", + "results_smaller_data.csv\n", + "results_stanza_basic.csv\n", + "results_stanza_MA_ensembles.csv\n", + "results_stanza_ME_conf_intervals.csv\n", + "results_stanza_ME_error_types.csv\n", + "results_stanza_ME_full_predict_on_clauses.csv\n", + "results_stanza_ME_full_predict_on_clauses_error_types.csv\n", + "results_stanza_ME_on_clauses.csv\n", + "results_stanza_ME_on_clauses_error_types.csv\n", + "results_stanza_ME_sketches_5groups_knockout.csv\n", + "results_stanza_ME_sketches_5groups_knockout_matrix.csv\n", + "results_stanza_ME_sketches_5randomgroups_knockout.csv\n", + "results_stanza_ME_sketches_5randomgroups_knockout_matrix.csv\n", + "results_stanza_UD_on_auto_UD_morph.csv\n" + ] + } + ], + "source": [ + "input_dir = 'edt_2.6' # Experiments done on Estonian Dependency Treebank version 2.6\n", + "\n", + 
"for fname in os.listdir(input_dir):\n", + " if fname.startswith('results_') and fname.endswith('.csv'):\n", + " print(fname)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "52ffdbec", + "metadata": {}, + "outputs": [], + "source": [ + "pandas.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "markdown", + "id": "fe406614", + "metadata": {}, + "source": [ + "### MaltParser and UDPipe-1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8a97e325", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_train
0eval_malt_morph_extended_full0.71860.84460.12600.76810.8715
1eval_udpipe1_default_morph_extended0.75600.85010.09410.80090.8811
2eval_udpipe1_embeddings_morph_extended0.77130.91470.14340.81350.9330
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test LAS_train LAS_gap \\\n", + "0 eval_malt_morph_extended_full 0.7186 0.8446 0.1260 \n", + "1 eval_udpipe1_default_morph_extended 0.7560 0.8501 0.0941 \n", + "2 eval_udpipe1_embeddings_morph_extended 0.7713 0.9147 0.1434 \n", + "\n", + " UAS_test UAS_train \n", + "0 0.7681 0.8715 \n", + "1 0.8009 0.8811 \n", + "2 0.8135 0.9330 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# MaltParser and UDPipe-1 results\n", + "read_csv('edt_2.6/results_full_data_malt_udpipe1.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "a18f8d5f", + "metadata": {}, + "source": [ + "### Basic stanza on Vabamorf's annotations (morph_analysis, morph_extended)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d62afeb0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_traintest_wordstrain_words
0eval_stanza_morph_analysis_full_data0.85070.92310.07240.88020.942248491389278
1eval_stanza_morph_extended_full_data0.84860.91760.06890.87820.937848491389278
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test LAS_train LAS_gap \\\n", + "0 eval_stanza_morph_analysis_full_data 0.8507 0.9231 0.0724 \n", + "1 eval_stanza_morph_extended_full_data 0.8486 0.9176 0.0689 \n", + "\n", + " UAS_test UAS_train test_words train_words \n", + "0 0.8802 0.9422 48491 389278 \n", + "1 0.8782 0.9378 48491 389278 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Basic Stanza parser results\n", + "read_csv('edt_2.6/results_stanza_basic.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d93d2d8e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainUAS_testUAS_trainLAS_gaptest_wordstrain_words
0eval_stanza_morph_extended_full_data_conf_intervals0.8401; 0.8486; 0.85720.9161; 0.9176; 0.91910.8697; 0.8782; 0.88670.9363; 0.9378; 0.93930.0760; 0.0689; 0.061948491389278
\n", + "
" + ], + "text/plain": [ + " experiment \\\n", + "0 eval_stanza_morph_extended_full_data_conf_intervals \n", + "\n", + " LAS_test LAS_train UAS_test \\\n", + "0 0.8401; 0.8486; 0.8572 0.9161; 0.9176; 0.9191 0.8697; 0.8782; 0.8867 \n", + "\n", + " UAS_train LAS_gap test_words train_words \n", + "0 0.9363; 0.9378; 0.9393 0.0760; 0.0689; 0.0619 48491 389278 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Basic Stanza ME parser with 95% conf intervals\n", + "read_csv('edt_2.6/results_stanza_ME_conf_intervals.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "15aa1546", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentE1E2E3E1_impactE2_impactE3_impactE1_rel_errorE2_rel_errorE3_rel_error...gold_in_clausegold_out_of_clausetotal_wordspunctunequal_lengthE1_missed_rootE2_missed_rootE3_missed_rootE2_with_E3E2_without_E3
0stanza_ME_error_types_on_test482139811620.75550.06240.18210.14130.01170.1733...34108670748491767600025436038
\n", + "

1 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " experiment E1 E2 E3 E1_impact E2_impact \\\n", + "0 stanza_ME_error_types_on_test 4821 398 1162 0.7555 0.0624 \n", + "\n", + " E3_impact E1_rel_error E2_rel_error E3_rel_error ... gold_in_clause \\\n", + "0 0.1821 0.1413 0.0117 0.1733 ... 34108 \n", + "\n", + " gold_out_of_clause total_words punct unequal_length E1_missed_root \\\n", + "0 6707 48491 7676 0 0 \n", + "\n", + " E2_missed_root E3_missed_root E2_with_E3 E2_without_E3 \n", + "0 0 254 360 38 \n", + "\n", + "[1 rows x 22 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza parser error types on test set\n", + "read_csv('edt_2.6/results_stanza_ME_error_types.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "dc0a2868", + "metadata": {}, + "source": [ + "### Stanza with crossvalidation" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d3c857dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_train
0eval_stanza_morph_extended_crossvalidation_010.84850.92920.08070.87780.9476
1eval_stanza_morph_extended_crossvalidation_020.84800.92170.07370.87670.9412
2eval_stanza_morph_extended_crossvalidation_030.84600.92740.08130.87600.9463
3eval_stanza_morph_extended_crossvalidation_040.84680.92630.07950.87630.9452
4eval_stanza_morph_extended_crossvalidation_050.85140.93080.07930.87970.9484
5eval_stanza_morph_extended_crossvalidation_060.84730.92030.07300.87720.9405
6eval_stanza_morph_extended_crossvalidation_070.84650.92010.07350.87590.9402
7eval_stanza_morph_extended_crossvalidation_080.84820.92970.08150.87790.9479
8eval_stanza_morph_extended_crossvalidation_090.84820.92410.07580.87820.9430
9eval_stanza_morph_extended_crossvalidation_100.85030.93220.08190.88000.9499
10eval_stanza_morph_extended_crossvalidation_AVG0.84810.92620.07800.87760.9450
11eval_stanza_ensemble_tagger_morph_extended_default_full_data0.85680.93370.07690.88510.9515
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test \\\n", + "0 eval_stanza_morph_extended_crossvalidation_01 0.8485 \n", + "1 eval_stanza_morph_extended_crossvalidation_02 0.8480 \n", + "2 eval_stanza_morph_extended_crossvalidation_03 0.8460 \n", + "3 eval_stanza_morph_extended_crossvalidation_04 0.8468 \n", + "4 eval_stanza_morph_extended_crossvalidation_05 0.8514 \n", + "5 eval_stanza_morph_extended_crossvalidation_06 0.8473 \n", + "6 eval_stanza_morph_extended_crossvalidation_07 0.8465 \n", + "7 eval_stanza_morph_extended_crossvalidation_08 0.8482 \n", + "8 eval_stanza_morph_extended_crossvalidation_09 0.8482 \n", + "9 eval_stanza_morph_extended_crossvalidation_10 0.8503 \n", + "10 eval_stanza_morph_extended_crossvalidation_AVG 0.8481 \n", + "11 eval_stanza_ensemble_tagger_morph_extended_default_full_data 0.8568 \n", + "\n", + " LAS_train LAS_gap UAS_test UAS_train \n", + "0 0.9292 0.0807 0.8778 0.9476 \n", + "1 0.9217 0.0737 0.8767 0.9412 \n", + "2 0.9274 0.0813 0.8760 0.9463 \n", + "3 0.9263 0.0795 0.8763 0.9452 \n", + "4 0.9308 0.0793 0.8797 0.9484 \n", + "5 0.9203 0.0730 0.8772 0.9405 \n", + "6 0.9201 0.0735 0.8759 0.9402 \n", + "7 0.9297 0.0815 0.8779 0.9479 \n", + "8 0.9241 0.0758 0.8782 0.9430 \n", + "9 0.9322 0.0819 0.8800 0.9499 \n", + "10 0.9262 0.0780 0.8776 0.9450 \n", + "11 0.9337 0.0769 0.8851 0.9515 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza parser crossvalidation\n", + "read_csv('edt_2.6/results_crossvalidation.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "42d78955", + "metadata": {}, + "source": [ + "### Stanza ablation experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "11e86a5e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_train
0eval_stanza_morph_extended_gap_experiments_01_no_wordforms0.83960.92360.08400.87340.9461
1eval_stanza_morph_extended_gap_experiments_02_no_lemmas0.84830.93080.08250.87890.9496
2eval_stanza_morph_extended_gap_experiments_02_no_pos0.84920.92840.07920.87810.9471
3eval_stanza_morph_extended_gap_experiments_03_no_wordforms_adj_noun_lemmas0.82000.91090.09090.86100.9396
4eval_stanza_morph_extended_gap_experiments_04_no_wordforms_verb_adpos_lemmas0.78220.91060.12850.81810.9370
5eval_stanza_morph_extended_gap_experiments_05_only_cg_list_wordforms_lemmas0.81580.91530.09950.85720.9433
6eval_stanza_morph_extended_gap_experiments_06_no_wordform_lemma_pos_keep_conj0.68690.81000.12300.75720.8701
7eval_stanza_morph_extended_gap_experiments_07_no_wordform_lemma_pos0.67230.76480.09250.75320.8351
8eval_stanza_morph_extended_gap_experiments_08_only_wordforms0.84070.92020.07950.87420.9391
9eval_stanza_morph_extended_gap_experiments_09_only_pos_feats0.70180.82130.11950.77090.8799
\n", + "
" + ], + "text/plain": [ + " experiment \\\n", + "0 eval_stanza_morph_extended_gap_experiments_01_no_wordforms \n", + "1 eval_stanza_morph_extended_gap_experiments_02_no_lemmas \n", + "2 eval_stanza_morph_extended_gap_experiments_02_no_pos \n", + "3 eval_stanza_morph_extended_gap_experiments_03_no_wordforms_adj_noun_lemmas \n", + "4 eval_stanza_morph_extended_gap_experiments_04_no_wordforms_verb_adpos_lemmas \n", + "5 eval_stanza_morph_extended_gap_experiments_05_only_cg_list_wordforms_lemmas \n", + "6 eval_stanza_morph_extended_gap_experiments_06_no_wordform_lemma_pos_keep_conj \n", + "7 eval_stanza_morph_extended_gap_experiments_07_no_wordform_lemma_pos \n", + "8 eval_stanza_morph_extended_gap_experiments_08_only_wordforms \n", + "9 eval_stanza_morph_extended_gap_experiments_09_only_pos_feats \n", + "\n", + " LAS_test LAS_train LAS_gap UAS_test UAS_train \n", + "0 0.8396 0.9236 0.0840 0.8734 0.9461 \n", + "1 0.8483 0.9308 0.0825 0.8789 0.9496 \n", + "2 0.8492 0.9284 0.0792 0.8781 0.9471 \n", + "3 0.8200 0.9109 0.0909 0.8610 0.9396 \n", + "4 0.7822 0.9106 0.1285 0.8181 0.9370 \n", + "5 0.8158 0.9153 0.0995 0.8572 0.9433 \n", + "6 0.6869 0.8100 0.1230 0.7572 0.8701 \n", + "7 0.6723 0.7648 0.0925 0.7532 0.8351 \n", + "8 0.8407 0.9202 0.0795 0.8742 0.9391 \n", + "9 0.7018 0.8213 0.1195 0.7709 0.8799 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza parser gap experiments, trained on full data, results\n", + "read_csv('edt_2.6/results_gap_experiments.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "15db66c1", + "metadata": {}, + "source": [ + "### Stanza half training data experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6c75808d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_train
0eval_stanza_morph_extended_half_data_0010.82950.94480.11530.86250.9564
1eval_stanza_morph_extended_half_data_0020.82780.92030.09250.86070.9392
2eval_stanza_morph_extended_half_data_0030.82760.94530.11770.86020.9568
3eval_stanza_morph_extended_half_data_0040.82650.93840.11190.85800.9521
4eval_stanza_morph_extended_half_data_0050.82640.92890.10240.86010.9443
5eval_stanza_morph_extended_half_data_0060.82480.92550.10070.85860.9425
6eval_stanza_morph_extended_half_data_0070.82750.92710.09960.86020.9438
7eval_stanza_morph_extended_half_data_0080.82870.93530.10650.86170.9495
8eval_stanza_morph_extended_half_data_0090.83040.93680.10650.86270.9504
9eval_stanza_morph_extended_half_data_0100.82520.92750.10230.85790.9445
10eval_stanza_morph_extended_half_data_AVG0.82740.93300.10550.86030.9480
11eval_stanza_morph_extended_half_data_ensemble0.84460.91510.07050.87600.9344
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test LAS_train \\\n", + "0 eval_stanza_morph_extended_half_data_001 0.8295 0.9448 \n", + "1 eval_stanza_morph_extended_half_data_002 0.8278 0.9203 \n", + "2 eval_stanza_morph_extended_half_data_003 0.8276 0.9453 \n", + "3 eval_stanza_morph_extended_half_data_004 0.8265 0.9384 \n", + "4 eval_stanza_morph_extended_half_data_005 0.8264 0.9289 \n", + "5 eval_stanza_morph_extended_half_data_006 0.8248 0.9255 \n", + "6 eval_stanza_morph_extended_half_data_007 0.8275 0.9271 \n", + "7 eval_stanza_morph_extended_half_data_008 0.8287 0.9353 \n", + "8 eval_stanza_morph_extended_half_data_009 0.8304 0.9368 \n", + "9 eval_stanza_morph_extended_half_data_010 0.8252 0.9275 \n", + "10 eval_stanza_morph_extended_half_data_AVG 0.8274 0.9330 \n", + "11 eval_stanza_morph_extended_half_data_ensemble 0.8446 0.9151 \n", + "\n", + " LAS_gap UAS_test UAS_train \n", + "0 0.1153 0.8625 0.9564 \n", + "1 0.0925 0.8607 0.9392 \n", + "2 0.1177 0.8602 0.9568 \n", + "3 0.1119 0.8580 0.9521 \n", + "4 0.1024 0.8601 0.9443 \n", + "5 0.1007 0.8586 0.9425 \n", + "6 0.0996 0.8602 0.9438 \n", + "7 0.1065 0.8617 0.9495 \n", + "8 0.1065 0.8627 0.9504 \n", + "9 0.1023 0.8579 0.9445 \n", + "10 0.1055 0.8603 0.9480 \n", + "11 0.0705 0.8760 0.9344 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza parser trained on half-data (i.e training data halved)\n", + "read_csv('edt_2.6/results_half_data.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "c4821788", + "metadata": {}, + "source": [ + "### Stanza increasing training data experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "95e64940", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_traintest_wordstrain_words
0eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0100.68640.94910.26280.76050.95914849139886
1eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0200.74900.96360.21460.80040.97004849177759
2eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0300.78190.93650.15460.82710.948948491117642
3eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0400.79890.93660.13770.83880.950248491157491
4eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0500.81520.92340.10820.85430.940648491195477
5eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0600.82500.92990.10490.85950.945848491235468
6eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0700.82410.91790.09380.86000.937048491273422
7eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0800.83270.91940.08670.86740.938348491311383
8eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0900.83430.91510.08080.86800.936248491349322
9eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_1000.83770.91040.07280.87230.932048491389278
10eval_stanza_morph_extended_smaller_data_02_keep_all_0100.75090.92860.17780.79950.94024849139886
11eval_stanza_morph_extended_smaller_data_02_keep_all_0200.78950.95520.16570.82970.96304849177759
12eval_stanza_morph_extended_smaller_data_02_keep_all_0300.80620.92680.12070.84390.944048491117642
13eval_stanza_morph_extended_smaller_data_02_keep_all_0400.82090.94140.12050.85450.954248491157491
14eval_stanza_morph_extended_smaller_data_02_keep_all_0500.82990.93600.10600.86160.950548491195477
15eval_stanza_morph_extended_smaller_data_02_keep_all_0600.84090.93500.09410.87230.950548491235468
16eval_stanza_morph_extended_smaller_data_02_keep_all_0700.83630.92480.08840.86750.942848491273422
17eval_stanza_morph_extended_smaller_data_02_keep_all_0800.84350.92950.08600.87390.946348491311383
18eval_stanza_morph_extended_smaller_data_02_keep_all_0900.84630.92610.07980.87560.943748491349322
19eval_stanza_morph_extended_smaller_data_02_keep_all_1000.84660.92360.07710.87630.942248491389278
20eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0100.62850.81740.18890.71210.85684849139886
21eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0200.65440.85420.19970.73360.89124849177759
22eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0300.66520.78470.11950.74120.844548491117642
23eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0400.67300.78890.11590.74790.847148491157491
24eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0500.68020.78780.10760.75360.847948491195477
25eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0600.68800.81240.12440.75950.866848491235468
26eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0700.69150.80000.10850.76380.859448491273422
27eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0800.69390.80010.10610.76530.857548491311383
28eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0900.69520.80810.11280.76580.866448491349322
29eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_1000.69700.78740.09040.76700.848948491389278
\n", + "
" + ], + "text/plain": [ + " experiment \\\n", + "0 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_010 \n", + "1 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_020 \n", + "2 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_030 \n", + "3 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_040 \n", + "4 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_050 \n", + "5 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_060 \n", + "6 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_070 \n", + "7 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_080 \n", + "8 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_090 \n", + "9 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_100 \n", + "10 eval_stanza_morph_extended_smaller_data_02_keep_all_010 \n", + "11 eval_stanza_morph_extended_smaller_data_02_keep_all_020 \n", + "12 eval_stanza_morph_extended_smaller_data_02_keep_all_030 \n", + "13 eval_stanza_morph_extended_smaller_data_02_keep_all_040 \n", + "14 eval_stanza_morph_extended_smaller_data_02_keep_all_050 \n", + "15 eval_stanza_morph_extended_smaller_data_02_keep_all_060 \n", + "16 eval_stanza_morph_extended_smaller_data_02_keep_all_070 \n", + "17 eval_stanza_morph_extended_smaller_data_02_keep_all_080 \n", + "18 eval_stanza_morph_extended_smaller_data_02_keep_all_090 \n", + "19 eval_stanza_morph_extended_smaller_data_02_keep_all_100 \n", + "20 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_010 \n", + "21 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_020 \n", + "22 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_030 \n", + "23 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_040 \n", + "24 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_050 \n", + "25 
eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_060 \n", + "26 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_070 \n", + "27 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_080 \n", + "28 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_090 \n", + "29 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_100 \n", + "\n", + " LAS_test LAS_train LAS_gap UAS_test UAS_train test_words train_words \n", + "0 0.6864 0.9491 0.2628 0.7605 0.9591 48491 39886 \n", + "1 0.7490 0.9636 0.2146 0.8004 0.9700 48491 77759 \n", + "2 0.7819 0.9365 0.1546 0.8271 0.9489 48491 117642 \n", + "3 0.7989 0.9366 0.1377 0.8388 0.9502 48491 157491 \n", + "4 0.8152 0.9234 0.1082 0.8543 0.9406 48491 195477 \n", + "5 0.8250 0.9299 0.1049 0.8595 0.9458 48491 235468 \n", + "6 0.8241 0.9179 0.0938 0.8600 0.9370 48491 273422 \n", + "7 0.8327 0.9194 0.0867 0.8674 0.9383 48491 311383 \n", + "8 0.8343 0.9151 0.0808 0.8680 0.9362 48491 349322 \n", + "9 0.8377 0.9104 0.0728 0.8723 0.9320 48491 389278 \n", + "10 0.7509 0.9286 0.1778 0.7995 0.9402 48491 39886 \n", + "11 0.7895 0.9552 0.1657 0.8297 0.9630 48491 77759 \n", + "12 0.8062 0.9268 0.1207 0.8439 0.9440 48491 117642 \n", + "13 0.8209 0.9414 0.1205 0.8545 0.9542 48491 157491 \n", + "14 0.8299 0.9360 0.1060 0.8616 0.9505 48491 195477 \n", + "15 0.8409 0.9350 0.0941 0.8723 0.9505 48491 235468 \n", + "16 0.8363 0.9248 0.0884 0.8675 0.9428 48491 273422 \n", + "17 0.8435 0.9295 0.0860 0.8739 0.9463 48491 311383 \n", + "18 0.8463 0.9261 0.0798 0.8756 0.9437 48491 349322 \n", + "19 0.8466 0.9236 0.0771 0.8763 0.9422 48491 389278 \n", + "20 0.6285 0.8174 0.1889 0.7121 0.8568 48491 39886 \n", + "21 0.6544 0.8542 0.1997 0.7336 0.8912 48491 77759 \n", + "22 0.6652 0.7847 0.1195 0.7412 0.8445 48491 117642 \n", + "23 0.6730 0.7889 0.1159 0.7479 0.8471 48491 157491 \n", + "24 0.6802 0.7878 0.1076 0.7536 0.8479 48491 195477 \n", + "25 
0.6880 0.8124 0.1244 0.7595 0.8668 48491 235468 \n", + "26 0.6915 0.8000 0.1085 0.7638 0.8594 48491 273422 \n", + "27 0.6939 0.8001 0.1061 0.7653 0.8575 48491 311383 \n", + "28 0.6952 0.8081 0.1128 0.7658 0.8664 48491 349322 \n", + "29 0.6970 0.7874 0.0904 0.7670 0.8489 48491 389278 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza parser trained while incrementally increasing training set sizes\n", + "# 01_08_only_wordforms -- trained on form (text); deleted: lemma, upos, xpos, feats;\n", + "# 02_keep_all -- trained on all fields: form (text), lemma, upos, xpos, feats;\n", + "# 03_only_pos_feats_09_only_pos_feats -- trained on upos, xpos, feats; deleted: form(text), lemma;\n", + "read_csv('edt_2.6/results_smaller_data.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "6cabedef", + "metadata": {}, + "source": [ + "### Ensemble of Stanza ME models (full data, half data, conf intervals)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b00f74b2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainUAS_testUAS_trainLAS_gap
0eval_stanza_morph_extended_full_data_conf_intervals0.8401; 0.8486; 0.85720.9161; 0.9176; 0.91910.8697; 0.8782; 0.88670.9363; 0.9378; 0.93930.0760; 0.0689; 0.0619
1eval_stanza_ensemble_morph_extended_full_data_las_coherence_conf_intervals0.8488; 0.8568; 0.86490.9327; 0.9337; 0.93470.8774; 0.8852; 0.89290.9505; 0.9515; 0.95250.0839; 0.0769; 0.0698
2eval_stanza_ensemble_morph_extended_full_data_majority_voting_conf_intervals0.8487; 0.8564; 0.86410.9327; 0.9337; 0.93470.8775; 0.8849; 0.89240.9507; 0.9517; 0.95270.0840; 0.0773; 0.0707
3eval_stanza_ensemble_morph_extended_half_data_las_coherence_conf_intervals0.8383; 0.8447; 0.85100.9134; 0.9152; 0.91690.8699; 0.8761; 0.88220.9325; 0.9344; 0.93630.0751; 0.0705; 0.0659
4eval_stanza_ensemble_morph_extended_half_data_majority_voting_conf_intervals0.8386; 0.8449; 0.85120.9109; 0.9126; 0.91430.8699; 0.8762; 0.88260.9306; 0.9325; 0.93440.0722; 0.0677; 0.0631
\n", + "
" + ], + "text/plain": [ + " experiment \\\n", + "0 eval_stanza_morph_extended_full_data_conf_intervals \n", + "1 eval_stanza_ensemble_morph_extended_full_data_las_coherence_conf_intervals \n", + "2 eval_stanza_ensemble_morph_extended_full_data_majority_voting_conf_intervals \n", + "3 eval_stanza_ensemble_morph_extended_half_data_las_coherence_conf_intervals \n", + "4 eval_stanza_ensemble_morph_extended_half_data_majority_voting_conf_intervals \n", + "\n", + " LAS_test LAS_train UAS_test \\\n", + "0 0.8401; 0.8486; 0.8572 0.9161; 0.9176; 0.9191 0.8697; 0.8782; 0.8867 \n", + "1 0.8488; 0.8568; 0.8649 0.9327; 0.9337; 0.9347 0.8774; 0.8852; 0.8929 \n", + "2 0.8487; 0.8564; 0.8641 0.9327; 0.9337; 0.9347 0.8775; 0.8849; 0.8924 \n", + "3 0.8383; 0.8447; 0.8510 0.9134; 0.9152; 0.9169 0.8699; 0.8761; 0.8822 \n", + "4 0.8386; 0.8449; 0.8512 0.9109; 0.9126; 0.9143 0.8699; 0.8762; 0.8826 \n", + "\n", + " UAS_train LAS_gap \n", + "0 0.9363; 0.9378; 0.9393 0.0760; 0.0689; 0.0619 \n", + "1 0.9505; 0.9515; 0.9525 0.0839; 0.0769; 0.0698 \n", + "2 0.9507; 0.9517; 0.9527 0.0840; 0.0773; 0.0707 \n", + "3 0.9325; 0.9344; 0.9363 0.0751; 0.0705; 0.0659 \n", + "4 0.9306; 0.9325; 0.9344 0.0722; 0.0677; 0.0631 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Ensemble results with 95% confidence intervals (normal distribution):\n", + "# 0 -- single model (baseline)\n", + "# 1-2 -- full data ensemble models\n", + "# 3-4 -- half data ensemble models\n", + "ensemble_conf_intervals = read_csv('edt_2.6/results_ensemble_conf_intervals.csv')\n", + "ensemble_conf_intervals" + ] + }, + { + "cell_type": "markdown", + "id": "f1e29bbc", + "metadata": {}, + "source": [ + "### Ensemble of Stanza MA models (full data)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a71530db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainUAS_testUAS_trainLAS_gap
0eval_stanza_ensemble_morph_analysis_full_data_default0.85730.93300.88510.9510.0757
1eval_stanza_ensemble_morph_analysis_full_data_majority_voting0.85730.93290.88580.9510.0756
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test \\\n", + "0 eval_stanza_ensemble_morph_analysis_full_data_default 0.8573 \n", + "1 eval_stanza_ensemble_morph_analysis_full_data_majority_voting 0.8573 \n", + "\n", + " LAS_train UAS_test UAS_train LAS_gap \n", + "0 0.9330 0.8851 0.951 0.0757 \n", + "1 0.9329 0.8858 0.951 0.0756 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ensemble_morph_analysis = read_csv('edt_2.6/results_stanza_MA_ensembles.csv')\n", + "ensemble_morph_analysis" + ] + }, + { + "cell_type": "markdown", + "id": "de31ccc1", + "metadata": {}, + "source": [ + "### Stanza on Stanza's default UD morph (automatically created)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6ff4d564", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainUAS_testUAS_trainLAS_gap
0eval_stanza_ud_on_stanza_auto_morph_full_data0.85190.92910.88010.94670.0772
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test LAS_train \\\n", + "0 eval_stanza_ud_on_stanza_auto_morph_full_data 0.8519 0.9291 \n", + "\n", + " UAS_test UAS_train LAS_gap \n", + "0 0.8801 0.9467 0.0772 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza parser trained on stanza's UD morph annotations:\n", + "read_csv('edt_2.6/results_stanza_UD_on_auto_UD_morph.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "3bb1fe46", + "metadata": {}, + "source": [ + "### Stanza on UD morph analysis (auto, gold)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e0de7753", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_train
0eval_stanza_ud_auto_morph_full_data0.85230.93330.0810.88080.9508
1eval_stanza_ud_gold_morph_full_data0.88100.93800.0570.89870.9502
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test LAS_train LAS_gap \\\n", + "0 eval_stanza_ud_auto_morph_full_data 0.8523 0.9333 0.081 \n", + "1 eval_stanza_ud_gold_morph_full_data 0.8810 0.9380 0.057 \n", + "\n", + " UAS_test UAS_train \n", + "0 0.8808 0.9508 \n", + "1 0.8987 0.9502 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza parser trained on UD annotations:\n", + "# ud_auto -- UD annotations automatically converted from Vabamorf's morph_analysis layer;\n", + "# ud_gold -- original gold standard UD annotations from the EDT corpus;\n", + "read_csv('edt_2.6/results_gold_and_auto_ud_morph.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "35944100", + "metadata": {}, + "source": [ + "### Stanza basic on the latest version of the corpus (2.11)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a34ba81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_train
0eval_stanza_morph_extended_full_data0.84840.92470.07640.87750.943
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test LAS_train LAS_gap \\\n", + "0 eval_stanza_morph_extended_full_data 0.8484 0.9247 0.0764 \n", + "\n", + " UAS_test UAS_train \n", + "0 0.8775 0.943 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Stanza trained and evaluated on the latest version of the corpus (EDT 2.11)\n", + "read_csv('edt_2.11/results_stanza_basic.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "16cb46f8", + "metadata": {}, + "source": [ + "### Composite table (Table 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f826a55d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_train
0eval_malt_morph_extended_full0.71860.84460.12600.76810.8715
1eval_udpipe1_default_morph_extended0.75600.85010.09410.80090.8811
2eval_udpipe1_embeddings_morph_extended0.77130.91470.14340.81350.9330
0eval_stanza_ud_on_stanza_auto_morph_full_data0.85190.92910.07720.88010.9467
8eval_stanza_morph_extended_gap_experiments_08_only_wordforms0.84070.92020.07950.87420.9391
0eval_stanza_morph_analysis_full_data0.85070.92310.07240.88020.9422
1eval_stanza_morph_extended_full_data (EDT_2.6)0.84860.91760.06890.87820.9378
0eval_stanza_morph_extended_full_data (EDT_2.11)0.84840.92470.07640.87750.9430
0eval_stanza_morph_extended_full_data_ensemble_majority_voting0.85640.93370.07730.88490.9517
0eval_stanza_ensemble_morph_analysis_full_data_default0.85730.93300.07570.88510.9510
1eval_stanza_ud_gold_morph_full_data0.88100.93800.05700.89870.9502
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test \\\n", + "0 eval_malt_morph_extended_full 0.7186 \n", + "1 eval_udpipe1_default_morph_extended 0.7560 \n", + "2 eval_udpipe1_embeddings_morph_extended 0.7713 \n", + "0 eval_stanza_ud_on_stanza_auto_morph_full_data 0.8519 \n", + "8 eval_stanza_morph_extended_gap_experiments_08_only_wordforms 0.8407 \n", + "0 eval_stanza_morph_analysis_full_data 0.8507 \n", + "1 eval_stanza_morph_extended_full_data (EDT_2.6) 0.8486 \n", + "0 eval_stanza_morph_extended_full_data (EDT_2.11) 0.8484 \n", + "0 eval_stanza_morph_extended_full_data_ensemble_majority_voting 0.8564 \n", + "0 eval_stanza_ensemble_morph_analysis_full_data_default 0.8573 \n", + "1 eval_stanza_ud_gold_morph_full_data 0.8810 \n", + "\n", + " LAS_train LAS_gap UAS_test UAS_train \n", + "0 0.8446 0.1260 0.7681 0.8715 \n", + "1 0.8501 0.0941 0.8009 0.8811 \n", + "2 0.9147 0.1434 0.8135 0.9330 \n", + "0 0.9291 0.0772 0.8801 0.9467 \n", + "8 0.9202 0.0795 0.8742 0.9391 \n", + "0 0.9231 0.0724 0.8802 0.9422 \n", + "1 0.9176 0.0689 0.8782 0.9378 \n", + "0 0.9247 0.0764 0.8775 0.9430 \n", + "0 0.9337 0.0773 0.8849 0.9517 \n", + "0 0.9330 0.0757 0.8851 0.9510 \n", + "1 0.9380 0.0570 0.8987 0.9502 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# MaltParser and UDPipe-1 results\n", + "res1 = read_csv('edt_2.6/results_full_data_malt_udpipe1.csv')\n", + "# Stanza parser trained on stanza's UD morph annotations:\n", + "res2x = read_csv('edt_2.6/results_stanza_UD_on_auto_UD_morph.csv')\n", + "# Stanza only_wordforms experiment\n", + "res2 = read_csv('edt_2.6/results_gap_experiments.csv')\n", + "res2 = res2.loc[res2['experiment'].str.contains('08_only_wordforms')]\n", + "# Basic Stanza MA and ME parser results on EDT 2.6\n", + "res3 = read_csv('edt_2.6/results_stanza_basic.csv')\n", + "# Rename models\n", + "res3.loc[:, 'experiment'] = res3.apply(lambda x: re.sub(r'eval_stanza_morph_extended_full_data',\\\n", + " 
'eval_stanza_morph_extended_full_data (EDT_2.6)',\n", + " x['experiment']), axis=1)\n", + "# Stanza trained and evaluated EDT 2.11\n", + "res4 = read_csv('edt_2.11/results_stanza_basic.csv')\n", + "res4.loc[:, 'experiment'] = res4.apply(lambda x: re.sub(r'eval_stanza_morph_extended_full_data',\\\n", + " 'eval_stanza_morph_extended_full_data (EDT_2.11)',\n", + " x['experiment']), axis=1)\n", + "# ME ensemble results with aggregation_algorithm = 'majority_voting'\n", + "res5 = read_csv('edt_2.6/results_ensemble_majority_voting.csv')\n", + "res5 = res5.loc[res5['experiment'].str.contains('full_data_ensemble_majority_voting')]\n", + "# MA ensemble results with default aggregation_algorithm\n", + "res6 = read_csv('edt_2.6/results_stanza_MA_ensembles.csv')\n", + "res6 = res6.loc[res6['experiment'].str.contains('eval_stanza_ensemble_morph_analysis_full_data_default')]\n", + "# Results on UD gold\n", + "res7 = read_csv('edt_2.6/results_gold_and_auto_ud_morph.csv')\n", + "res7 = res7.loc[res7['experiment'].str.contains('ud_gold')]\n", + "composite_res = pandas.concat([res1, res2x, res2, res3, res4, res5, res6, res7])\n", + "composite_res = composite_res.drop(['test_words', 'train_words'], axis=1)\n", + "composite_res" + ] + }, + { + "cell_type": "markdown", + "id": "44cac810", + "metadata": {}, + "source": [ + "### Hoeffding's bounds" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e61a80a3", + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "\n", + "# Calculates Hoeffding's bound. 
\n", + "# Parameters:\n", + "# n -- evaluation set size in words\n", + "# alpha -- confidence level is (1 - alpha)\n", + "def hoeffding_bounds(n, alpha):\n", + " return math.sqrt( (1/(2*n)*math.log(2/(alpha))) )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "27062c90", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.006167393570927968" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bounds for test set\n", + "hfb_test = hoeffding_bounds(n=48491, alpha=0.05)\n", + "hfb_test" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1b449363", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0021767186774462486" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bounds for train set\n", + "hfb_train = hoeffding_bounds(n=389278, alpha=0.05)\n", + "hfb_train" + ] + }, + { + "cell_type": "markdown", + "id": "fe8720c6", + "metadata": {}, + "source": [ + "#### Table 1" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "640af725", + "metadata": {}, + "outputs": [], + "source": [ + "hfb_res = composite_res.copy()\n", + "hfb_res = hfb_res.drop(['UAS_test', 'UAS_train'], axis=1)\n", + "hfb_res.loc[:, 'LAS_test_lower'] = hfb_res.apply(lambda x: round(x['LAS_test'] - hfb_test, 4), axis=1)\n", + "hfb_res.loc[:, 'LAS_test_upper'] = hfb_res.apply(lambda x: round(x['LAS_test'] + hfb_test, 4), axis=1)\n", + "hfb_res.loc[:, 'LAS_train_lower'] = hfb_res.apply(lambda x: round(x['LAS_train'] - hfb_train, 4), axis=1)\n", + "hfb_res.loc[:, 'LAS_train_upper'] = hfb_res.apply(lambda x: round(x['LAS_train'] + hfb_train, 4), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "529e2f74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_test_lowerLAS_testLAS_test_upperLAS_train_lowerLAS_trainLAS_train_upperLAS_gap
0eval_malt_morph_extended_full0.71240.71860.72480.84240.84460.84680.1260
1eval_udpipe1_default_morph_extended0.74980.75600.76220.84790.85010.85230.0941
2eval_udpipe1_embeddings_morph_extended0.76510.77130.77750.91250.91470.91690.1434
0eval_stanza_ud_on_stanza_auto_morph_full_data0.84570.85190.85810.92690.92910.93130.0772
8eval_stanza_morph_extended_gap_experiments_08_only_wordforms0.83450.84070.84690.91800.92020.92240.0795
0eval_stanza_morph_analysis_full_data0.84450.85070.85690.92090.92310.92530.0724
1eval_stanza_morph_extended_full_data (EDT_2.6)0.84240.84860.85480.91540.91760.91980.0689
0eval_stanza_morph_extended_full_data (EDT_2.11)0.84220.84840.85460.92250.92470.92690.0764
0eval_stanza_morph_extended_full_data_ensemble_majority_voting0.85020.85640.86260.93150.93370.93590.0773
0eval_stanza_ensemble_morph_analysis_full_data_default0.85110.85730.86350.93080.93300.93520.0757
1eval_stanza_ud_gold_morph_full_data0.87480.88100.88720.93580.93800.94020.0570
\n", + "
" + ], + "text/plain": [ + " experiment \\\n", + "0 eval_malt_morph_extended_full \n", + "1 eval_udpipe1_default_morph_extended \n", + "2 eval_udpipe1_embeddings_morph_extended \n", + "0 eval_stanza_ud_on_stanza_auto_morph_full_data \n", + "8 eval_stanza_morph_extended_gap_experiments_08_only_wordforms \n", + "0 eval_stanza_morph_analysis_full_data \n", + "1 eval_stanza_morph_extended_full_data (EDT_2.6) \n", + "0 eval_stanza_morph_extended_full_data (EDT_2.11) \n", + "0 eval_stanza_morph_extended_full_data_ensemble_majority_voting \n", + "0 eval_stanza_ensemble_morph_analysis_full_data_default \n", + "1 eval_stanza_ud_gold_morph_full_data \n", + "\n", + " LAS_test_lower LAS_test LAS_test_upper LAS_train_lower LAS_train \\\n", + "0 0.7124 0.7186 0.7248 0.8424 0.8446 \n", + "1 0.7498 0.7560 0.7622 0.8479 0.8501 \n", + "2 0.7651 0.7713 0.7775 0.9125 0.9147 \n", + "0 0.8457 0.8519 0.8581 0.9269 0.9291 \n", + "8 0.8345 0.8407 0.8469 0.9180 0.9202 \n", + "0 0.8445 0.8507 0.8569 0.9209 0.9231 \n", + "1 0.8424 0.8486 0.8548 0.9154 0.9176 \n", + "0 0.8422 0.8484 0.8546 0.9225 0.9247 \n", + "0 0.8502 0.8564 0.8626 0.9315 0.9337 \n", + "0 0.8511 0.8573 0.8635 0.9308 0.9330 \n", + "1 0.8748 0.8810 0.8872 0.9358 0.9380 \n", + "\n", + " LAS_train_upper LAS_gap \n", + "0 0.8468 0.1260 \n", + "1 0.8523 0.0941 \n", + "2 0.9169 0.1434 \n", + "0 0.9313 0.0772 \n", + "8 0.9224 0.0795 \n", + "0 0.9253 0.0724 \n", + "1 0.9198 0.0689 \n", + "0 0.9269 0.0764 \n", + "0 0.9359 0.0773 \n", + "0 0.9352 0.0757 \n", + "1 0.9402 0.0570 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hfb_res[['experiment', 'LAS_test_lower', 'LAS_test', 'LAS_test_upper', 'LAS_train_lower', 'LAS_train', 'LAS_train_upper', 'LAS_gap']]" + ] + }, + { + "cell_type": "markdown", + "id": "d6711679", + "metadata": {}, + "source": [ + "#### Table 3" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "253f564c", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainUAS_testUAS_trainLAS_gap
0eval_stanza_morph_extended_full_data_conf_intervals0.8401; 0.8486; 0.85720.9161; 0.9176; 0.91910.8697; 0.8782; 0.88670.9363; 0.9378; 0.93930.0760; 0.0689; 0.0619
1eval_stanza_ensemble_morph_extended_full_data_las_coherence_conf_intervals0.8488; 0.8568; 0.86490.9327; 0.9337; 0.93470.8774; 0.8852; 0.89290.9505; 0.9515; 0.95250.0839; 0.0769; 0.0698
2eval_stanza_ensemble_morph_extended_full_data_majority_voting_conf_intervals0.8487; 0.8564; 0.86410.9327; 0.9337; 0.93470.8775; 0.8849; 0.89240.9507; 0.9517; 0.95270.0840; 0.0773; 0.0707
3eval_stanza_ensemble_morph_extended_half_data_las_coherence_conf_intervals0.8383; 0.8447; 0.85100.9134; 0.9152; 0.91690.8699; 0.8761; 0.88220.9325; 0.9344; 0.93630.0751; 0.0705; 0.0659
4eval_stanza_ensemble_morph_extended_half_data_majority_voting_conf_intervals0.8386; 0.8449; 0.85120.9109; 0.9126; 0.91430.8699; 0.8762; 0.88260.9306; 0.9325; 0.93440.0722; 0.0677; 0.0631
\n", + "
" + ], + "text/plain": [ + " experiment \\\n", + "0 eval_stanza_morph_extended_full_data_conf_intervals \n", + "1 eval_stanza_ensemble_morph_extended_full_data_las_coherence_conf_intervals \n", + "2 eval_stanza_ensemble_morph_extended_full_data_majority_voting_conf_intervals \n", + "3 eval_stanza_ensemble_morph_extended_half_data_las_coherence_conf_intervals \n", + "4 eval_stanza_ensemble_morph_extended_half_data_majority_voting_conf_intervals \n", + "\n", + " LAS_test LAS_train UAS_test \\\n", + "0 0.8401; 0.8486; 0.8572 0.9161; 0.9176; 0.9191 0.8697; 0.8782; 0.8867 \n", + "1 0.8488; 0.8568; 0.8649 0.9327; 0.9337; 0.9347 0.8774; 0.8852; 0.8929 \n", + "2 0.8487; 0.8564; 0.8641 0.9327; 0.9337; 0.9347 0.8775; 0.8849; 0.8924 \n", + "3 0.8383; 0.8447; 0.8510 0.9134; 0.9152; 0.9169 0.8699; 0.8761; 0.8822 \n", + "4 0.8386; 0.8449; 0.8512 0.9109; 0.9126; 0.9143 0.8699; 0.8762; 0.8826 \n", + "\n", + " UAS_train LAS_gap \n", + "0 0.9363; 0.9378; 0.9393 0.0760; 0.0689; 0.0619 \n", + "1 0.9505; 0.9515; 0.9525 0.0839; 0.0769; 0.0698 \n", + "2 0.9507; 0.9517; 0.9527 0.0840; 0.0773; 0.0707 \n", + "3 0.9325; 0.9344; 0.9363 0.0751; 0.0705; 0.0659 \n", + "4 0.9306; 0.9325; 0.9344 0.0722; 0.0677; 0.0631 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ensemble_conf_intervals = read_csv('edt_2.6/results_ensemble_conf_intervals.csv')\n", + "ensemble_conf_intervals" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "b5f79a20", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "eval_stanza_morph_extended_full_data_conf_intervals | 0.8401 0.8486 0.8572 | -0.0085 | +0.0086\n", + "eval_stanza_ensemble_morph_extended_full_data_las_coherence_conf_intervals | 0.8488 0.8568 0.8649 | -0.0080 | +0.0081\n", + "eval_stanza_ensemble_morph_extended_full_data_majority_voting_conf_intervals | 0.8487 0.8564 0.8641 | -0.0077 | +0.0077\n", + 
"eval_stanza_ensemble_morph_extended_half_data_las_coherence_conf_intervals | 0.8383 0.8447 0.8510 | -0.0064 | +0.0063\n", + "eval_stanza_ensemble_morph_extended_half_data_majority_voting_conf_intervals | 0.8386 0.8449 0.8512 | -0.0063 | +0.0063\n" + ] + } + ], + "source": [ + "# Decompose test LAS conf bounds\n", + "for i in ensemble_conf_intervals.index:\n", + " exp = ensemble_conf_intervals['experiment'][i]\n", + " parts = ensemble_conf_intervals['LAS_test'][i].split(';')\n", + " low, mid, high = [float(part.strip()) for part in parts]\n", + " out = f'{exp:77s} | {low:.4f} {mid:.4f} {high:.4f} | -{round(mid-low, 4):.4f} | +{round(high-mid,4):.4f}'\n", + " print(out)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "12a0869e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testtest_LAS_0_entropytest_LAS_0_entropy_total_words_%test_LAS_vs_entropy_corr
0eval_stanza_ensemble_morph_extended_full_data_las_coherence_entropy0.85680.925682.22140.4431
1eval_stanza_ensemble_morph_extended_full_data_majority_voting_entropy0.85640.925682.22140.4465
2eval_stanza_ensemble_morph_extended_half_data_las_coherence_entropy0.84460.940573.57860.4760
3eval_stanza_ensemble_morph_extended_half_data_majority_voting_entropy0.84490.940573.57860.4777
\n", + "
" + ], + "text/plain": [ + " experiment \\\n", + "0 eval_stanza_ensemble_morph_extended_full_data_las_coherence_entropy \n", + "1 eval_stanza_ensemble_morph_extended_full_data_majority_voting_entropy \n", + "2 eval_stanza_ensemble_morph_extended_half_data_las_coherence_entropy \n", + "3 eval_stanza_ensemble_morph_extended_half_data_majority_voting_entropy \n", + "\n", + " LAS_test test_LAS_0_entropy test_LAS_0_entropy_total_words_% \\\n", + "0 0.8568 0.9256 82.2214 \n", + "1 0.8564 0.9256 82.2214 \n", + "2 0.8446 0.9405 73.5786 \n", + "3 0.8449 0.9405 73.5786 \n", + "\n", + " test_LAS_vs_entropy_corr \n", + "0 0.4431 \n", + "1 0.4465 \n", + "2 0.4760 \n", + "3 0.4777 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ensemble_entropy = read_csv('edt_2.6/results_ensemble_entropy.csv')\n", + "ensemble_entropy[['experiment', 'LAS_test', 'test_LAS_0_entropy', 'test_LAS_0_entropy_total_words_%', 'test_LAS_vs_entropy_corr']]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/07_smaller_data_exp_and_extrapolation.ipynb b/07_smaller_data_exp_and_extrapolation.ipynb new file mode 100644 index 00000000..9d0a7e15 --- /dev/null +++ b/07_smaller_data_exp_and_extrapolation.ipynb @@ -0,0 +1,1138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0414f820", + "metadata": {}, + "source": [ + "## Draw figures about results of smaller data experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d31710fd", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from pandas import 
DataFrame as df\n", + "from pandas import read_csv\n", + "import pandas\n", + "from plotnine import ggplot, geom_point, aes, geom_line, ylab\n", + "from plotnine import geom_abline, geom_hline, ggtitle, theme, theme_bw" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8a97e325", + "metadata": {}, + "outputs": [], + "source": [ + "df = read_csv('edt_2.6/results_smaller_data.csv') \\\n", + " .rename(columns={'LAS_test': 'test LAS'}) \\\n", + " .rename(columns={'LAS_train': 'train LAS'}) \\\n", + " .drop(['LAS_gap', 'UAS_test', 'UAS_train'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "11e86a5e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimenttest LAStrain LAStest_wordstrain_words
0eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0100.68640.94914849139886
1eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0200.74900.96364849177759
2eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0300.78190.936548491117642
3eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0400.79890.936648491157491
4eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0500.81520.923448491195477
5eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0600.82500.929948491235468
6eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0700.82410.917948491273422
7eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0800.83270.919448491311383
8eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_0900.83430.915148491349322
9eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_1000.83770.910448491389278
10eval_stanza_morph_extended_smaller_data_02_keep_all_0100.75090.92864849139886
11eval_stanza_morph_extended_smaller_data_02_keep_all_0200.78950.95524849177759
12eval_stanza_morph_extended_smaller_data_02_keep_all_0300.80620.926848491117642
13eval_stanza_morph_extended_smaller_data_02_keep_all_0400.82090.941448491157491
14eval_stanza_morph_extended_smaller_data_02_keep_all_0500.82990.936048491195477
15eval_stanza_morph_extended_smaller_data_02_keep_all_0600.84090.935048491235468
16eval_stanza_morph_extended_smaller_data_02_keep_all_0700.83630.924848491273422
17eval_stanza_morph_extended_smaller_data_02_keep_all_0800.84350.929548491311383
18eval_stanza_morph_extended_smaller_data_02_keep_all_0900.84630.926148491349322
19eval_stanza_morph_extended_smaller_data_02_keep_all_1000.84660.923648491389278
20eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0100.62850.81744849139886
21eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0200.65440.85424849177759
22eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0300.66520.784748491117642
23eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0400.67300.788948491157491
24eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0500.68020.787848491195477
25eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0600.68800.812448491235468
26eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0700.69150.800048491273422
27eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0800.69390.800148491311383
28eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_0900.69520.808148491349322
29eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_1000.69700.787448491389278
\n", + "
" + ], + "text/plain": [ + " experiment \n", + "0 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_010 \\\n", + "1 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_020 \n", + "2 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_030 \n", + "3 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_040 \n", + "4 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_050 \n", + "5 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_060 \n", + "6 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_070 \n", + "7 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_080 \n", + "8 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_090 \n", + "9 eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_100 \n", + "10 eval_stanza_morph_extended_smaller_data_02_keep_all_010 \n", + "11 eval_stanza_morph_extended_smaller_data_02_keep_all_020 \n", + "12 eval_stanza_morph_extended_smaller_data_02_keep_all_030 \n", + "13 eval_stanza_morph_extended_smaller_data_02_keep_all_040 \n", + "14 eval_stanza_morph_extended_smaller_data_02_keep_all_050 \n", + "15 eval_stanza_morph_extended_smaller_data_02_keep_all_060 \n", + "16 eval_stanza_morph_extended_smaller_data_02_keep_all_070 \n", + "17 eval_stanza_morph_extended_smaller_data_02_keep_all_080 \n", + "18 eval_stanza_morph_extended_smaller_data_02_keep_all_090 \n", + "19 eval_stanza_morph_extended_smaller_data_02_keep_all_100 \n", + "20 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_010 \n", + "21 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_020 \n", + "22 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_030 \n", + "23 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_040 \n", + "24 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_050 \n", + "25 
eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_060 \n", + "26 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_070 \n", + "27 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_080 \n", + "28 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_090 \n", + "29 eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_100 \n", + "\n", + " test LAS train LAS test_words train_words \n", + "0 0.6864 0.9491 48491 39886 \n", + "1 0.7490 0.9636 48491 77759 \n", + "2 0.7819 0.9365 48491 117642 \n", + "3 0.7989 0.9366 48491 157491 \n", + "4 0.8152 0.9234 48491 195477 \n", + "5 0.8250 0.9299 48491 235468 \n", + "6 0.8241 0.9179 48491 273422 \n", + "7 0.8327 0.9194 48491 311383 \n", + "8 0.8343 0.9151 48491 349322 \n", + "9 0.8377 0.9104 48491 389278 \n", + "10 0.7509 0.9286 48491 39886 \n", + "11 0.7895 0.9552 48491 77759 \n", + "12 0.8062 0.9268 48491 117642 \n", + "13 0.8209 0.9414 48491 157491 \n", + "14 0.8299 0.9360 48491 195477 \n", + "15 0.8409 0.9350 48491 235468 \n", + "16 0.8363 0.9248 48491 273422 \n", + "17 0.8435 0.9295 48491 311383 \n", + "18 0.8463 0.9261 48491 349322 \n", + "19 0.8466 0.9236 48491 389278 \n", + "20 0.6285 0.8174 48491 39886 \n", + "21 0.6544 0.8542 48491 77759 \n", + "22 0.6652 0.7847 48491 117642 \n", + "23 0.6730 0.7889 48491 157491 \n", + "24 0.6802 0.7878 48491 195477 \n", + "25 0.6880 0.8124 48491 235468 \n", + "26 0.6915 0.8000 48491 273422 \n", + "27 0.6939 0.8001 48491 311383 \n", + "28 0.6952 0.8081 48491 349322 \n", + "29 0.6970 0.7874 48491 389278 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pandas.set_option('display.max_colwidth', None)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6c75808d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
test LAStrain LAStest_wordstrain_wordsmodeltraining size
068.6494.914849139886stanza ME only wordforms10
174.9096.364849177759stanza ME only wordforms20
278.1993.6548491117642stanza ME only wordforms30
379.8993.6648491157491stanza ME only wordforms40
481.5292.3448491195477stanza ME only wordforms50
582.5092.9948491235468stanza ME only wordforms60
682.4191.7948491273422stanza ME only wordforms70
783.2791.9448491311383stanza ME only wordforms80
883.4391.5148491349322stanza ME only wordforms90
983.7791.0448491389278stanza ME only wordforms100
1075.0992.864849139886stanza ME all features10
1178.9595.524849177759stanza ME all features20
1280.6292.6848491117642stanza ME all features30
1382.0994.1448491157491stanza ME all features40
1482.9993.6048491195477stanza ME all features50
1584.0993.5048491235468stanza ME all features60
1683.6392.4848491273422stanza ME all features70
1784.3592.9548491311383stanza ME all features80
1884.6392.6148491349322stanza ME all features90
1984.6692.3648491389278stanza ME all features100
2062.8581.744849139886stanza ME only postags and feats10
2165.4485.424849177759stanza ME only postags and feats20
2266.5278.4748491117642stanza ME only postags and feats30
2367.3078.8948491157491stanza ME only postags and feats40
2468.0278.7848491195477stanza ME only postags and feats50
2568.8081.2448491235468stanza ME only postags and feats60
2669.1580.0048491273422stanza ME only postags and feats70
2769.3980.0148491311383stanza ME only postags and feats80
2869.5280.8148491349322stanza ME only postags and feats90
2969.7078.7448491389278stanza ME only postags and feats100
\n", + "
" + ], + "text/plain": [ + " test LAS train LAS test_words train_words \n", + "0 68.64 94.91 48491 39886 \\\n", + "1 74.90 96.36 48491 77759 \n", + "2 78.19 93.65 48491 117642 \n", + "3 79.89 93.66 48491 157491 \n", + "4 81.52 92.34 48491 195477 \n", + "5 82.50 92.99 48491 235468 \n", + "6 82.41 91.79 48491 273422 \n", + "7 83.27 91.94 48491 311383 \n", + "8 83.43 91.51 48491 349322 \n", + "9 83.77 91.04 48491 389278 \n", + "10 75.09 92.86 48491 39886 \n", + "11 78.95 95.52 48491 77759 \n", + "12 80.62 92.68 48491 117642 \n", + "13 82.09 94.14 48491 157491 \n", + "14 82.99 93.60 48491 195477 \n", + "15 84.09 93.50 48491 235468 \n", + "16 83.63 92.48 48491 273422 \n", + "17 84.35 92.95 48491 311383 \n", + "18 84.63 92.61 48491 349322 \n", + "19 84.66 92.36 48491 389278 \n", + "20 62.85 81.74 48491 39886 \n", + "21 65.44 85.42 48491 77759 \n", + "22 66.52 78.47 48491 117642 \n", + "23 67.30 78.89 48491 157491 \n", + "24 68.02 78.78 48491 195477 \n", + "25 68.80 81.24 48491 235468 \n", + "26 69.15 80.00 48491 273422 \n", + "27 69.39 80.01 48491 311383 \n", + "28 69.52 80.81 48491 349322 \n", + "29 69.70 78.74 48491 389278 \n", + "\n", + " model training size \n", + "0 stanza ME only wordforms 10 \n", + "1 stanza ME only wordforms 20 \n", + "2 stanza ME only wordforms 30 \n", + "3 stanza ME only wordforms 40 \n", + "4 stanza ME only wordforms 50 \n", + "5 stanza ME only wordforms 60 \n", + "6 stanza ME only wordforms 70 \n", + "7 stanza ME only wordforms 80 \n", + "8 stanza ME only wordforms 90 \n", + "9 stanza ME only wordforms 100 \n", + "10 stanza ME all features 10 \n", + "11 stanza ME all features 20 \n", + "12 stanza ME all features 30 \n", + "13 stanza ME all features 40 \n", + "14 stanza ME all features 50 \n", + "15 stanza ME all features 60 \n", + "16 stanza ME all features 70 \n", + "17 stanza ME all features 80 \n", + "18 stanza ME all features 90 \n", + "19 stanza ME all features 100 \n", + "20 stanza ME only postags and feats 10 \n", + "21 stanza ME only 
postags and feats 20 \n", + "22 stanza ME only postags and feats 30 \n", + "23 stanza ME only postags and feats 40 \n", + "24 stanza ME only postags and feats 50 \n", + "25 stanza ME only postags and feats 60 \n", + "26 stanza ME only postags and feats 70 \n", + "27 stanza ME only postags and feats 80 \n", + "28 stanza ME only postags and feats 90 \n", + "29 stanza ME only postags and feats 100 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[:, 'model'] = df.apply(lambda x: x['experiment'].replace('eval_stanza_morph_extended_smaller_data_', 'stanza_ME_'), axis=1)\n", + "df.loc[:, 'model'] = df.apply(lambda x: re.sub(r'_[0-9]+$','',x['model']), axis=1)\n", + "df.loc[:, 'model'] = df.apply(lambda x: re.sub(r'_09_only_pos_feats$','',x['model']), axis=1)\n", + "df.loc[:, 'model'] = df.apply(lambda x: re.sub(r'_08','',x['model']), axis=1)\n", + "# Rename models\n", + "df.loc[:, 'model'] = df.apply(lambda x: re.sub(r'stanza_ME_01_only_wordforms','stanza ME only wordforms',x['model']), axis=1)\n", + "df.loc[:, 'model'] = df.apply(lambda x: re.sub(r'stanza_ME_02_keep_all', 'stanza ME all features',x['model']), axis=1)\n", + "df.loc[:, 'model'] = df.apply(lambda x: re.sub(r'stanza_ME_03_only_pos_feats','stanza ME only postags and feats',x['model']), axis=1)\n", + "df.loc[:, 'training size'] = df.apply(lambda x: int((x['experiment'].split('_')[-1]).lstrip('0')), axis=1) \n", + "df = df.drop(['experiment'], axis=1)\n", + "# Normalize LAS scores\n", + "df['test LAS'] = df['test LAS'].multiply(100.0)\n", + "df['train LAS'] = df['train LAS'].multiply(100.0)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "95e64940", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Programmid\\Miniconda3\\envs\\py38_estnltk_neural\\lib\\site-packages\\plotnine\\ggplot.py:718: PlotnineWarning: Saving 10 x 8 in image.\n", + "C:\\Programmid\\Miniconda3\\envs\\py38_estnltk_neural\\lib\\site-packages\\plotnine\\ggplot.py:719: PlotnineWarning: Filename: figure_1_test.pdf\n" + ] + } + ], + "source": [ + "p = ggplot(df) + theme_bw() + geom_point(aes(x='training size', y='test LAS', color='model'))\n", + "p = p + geom_line(aes(x='training size', y='test LAS', color='model')) + theme(legend_position='top')\n", + "display(p)\n", + "p.save('figure_1_test.pdf', height=8, width=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c4457cc7", + "metadata": {}, + "outputs": [], + "source": [ + "# Separate test and train for results visualization\n", + "train_df = df.drop(['test LAS'], axis=1)\n", + "train_df.loc[:, 'eval set'] = df.apply(lambda x: 'train', axis=1)\n", + "train_df = train_df.rename(columns={'train LAS': 'LAS'})\n", + "test_df = df.drop(['train LAS'], axis=1)\n", + "test_df.loc[:, 'eval set'] = df.apply(lambda x: 'test', axis=1)\n", + "test_df = test_df.rename(columns={'test LAS': 'LAS'})\n", + "#joint_df = pandas.concat([train_df, test_df])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e0de7753", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Programmid\\Miniconda3\\envs\\py38_estnltk_neural\\lib\\site-packages\\plotnine\\ggplot.py:718: PlotnineWarning: Saving 6 x 10 in image.\n", + "C:\\Programmid\\Miniconda3\\envs\\py38_estnltk_neural\\lib\\site-packages\\plotnine\\ggplot.py:719: PlotnineWarning: Filename: figure_1_train_and_test.pdf\n" + ] + } + ], + "source": [ + "# Test and training set performance as a function of training set size (%)\n", + "from plotnine import labs, facet_wrap\n", + "p = ggplot(train_df) + theme_bw()\n", + "p = p + labs(x='training size (% of all training data)')\n", + "p = p + geom_point(train_df, aes(x='training size', y='LAS', color='model'))\n", + "p = p + geom_line(train_df, aes(x='training size', y='LAS', color='model'))\n", + "p = p + geom_point(test_df, aes(x='training size', y='LAS', color='model'))\n", + "p = p + geom_line(test_df, aes(x='training size', y='LAS', color='model'))\n", + "p = p + theme(legend_position='bottom', figure_size=(6, 10), legend_box_spacing=.25) + facet_wrap(['eval set'], ncol=1)\n", + "display(p)\n", + "p.save('figure_1_train_and_test.pdf')" + ] + }, + { + "cell_type": "markdown", + "id": "f23456a5", + "metadata": {}, + "source": [ + "## Extrapolation of test and training errors\n", + "\n", + "* We use the range 70% - 100% for the extrapolation\n", + "* We use linear regression to find the linear trend\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1ce80300", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stanza ME only wordforms\n", + "Test fit: 0.04239999999999965 * x + 79.61600000000003\n", + "Train fit: -0.02680000000000035 * x + 93.84800000000003\n", + "Improvement rate 108.92\n", + "Min stablisation size: 205.66% 
800608\n", + "Limiting training error: 8833.62%\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stanza ME all features\n", + "Test fit: 0.03369999999999961 * x + 81.45300000000003\n", + "Train fit: -0.006999999999999744 * x + 93.195\n", + "Improvement rate 86.57\n", + "Min stablisation size: 288.50% 1123072\n", + "Limiting training error: 9117.55%\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stanza ME only postags and feats\n", + "Test fit: 0.017799999999999584 * x + 67.92700000000004\n", + "Train fit: -0.029800000000000177 * x + 82.42300000000002\n", + "Improvement rate 45.73\n", + "Min stablisation size: 304.54% 1185499\n", + "Limiting training error: 7334.78%\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "\n", + "#\n", + "# (A) this is training set size if we ignore punctuation \n", + "# tokens while calculating LAS scores.\n", + "#\n", + "tr_size = 325531\n", + "\n", + "#\n", + "# (B) this is training set size if we include punctuation \n", + "# tokens while calculating LAS scores.\n", + "#\n", + "tr_size = 389278\n", + "\n", + "plots = []\n", + "lm = LinearRegression()\n", + "for model in df['model'].unique():\n", + "\n", + " print(model)\n", + " data = df.set_index('model').loc[model]\n", + "\n", + " # Test\n", + " X = data.loc[data['training size']>=70, ['training size']]\n", + " y = data.loc[data['training size']>=70, 'test LAS']\n", + " lm.fit(X,y) \n", + " a = lm.coef_[0]\n", + " b = lm.intercept_\n", + " print('Test fit: {} * x + {}'.format(a, b))\n", + "\n", + " X = data.loc[data['training size']>=70, ['training size']]\n", + " y = data.loc[data['training size']>=70, 'train LAS']\n", + " lm.fit(X,y) \n", + " c = lm.coef_[0]\n", + " d = lm.intercept_\n", + " print('Train fit: {} * x + {}'.format(c, d))\n", + "\n", + " print('Improvement rate {:.2f}'.format((100 * a) * (100000/tr_size * 100)) )\n", + "\n", + " e = (d-b)/(a-c)\n", + " lim_las = (a * e + b)\n", + " print('Min stablisation size: {:.2f}% {:.0f}'.format(e, e * tr_size / 100))\n", + " print('Limiting training error: {:.2f}%'.format(100 * lim_las)) \n", + "\n", + " p = ggplot(data) + theme_bw() + geom_point(aes(x='training size', y='test LAS'), color='blue')\n", + " p = p + geom_line(aes(x='training size', y='test LAS'), color='blue')\n", + " p = p + geom_line(aes(x='training size', y='train LAS'), color='red')\n", + " p = p + geom_abline(intercept=b, slope=a, linetype='dashed', color = 'blue')\n", + " p = p + geom_abline(intercept=d, slope=c, 
linetype='dashed', color = 'red')\n", + " p = p + geom_hline(yintercept=lim_las, linetype='dashed')\n", + " p = p + ylab(\"LAS\")\n", + " p = p + ggtitle(model)\n", + " plots.append(p)\n", + " display(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b0505ab7", + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# Install patchworklib which allows to arrange multiple plots into a single plot\n", + "# https://github.com/ponnhide/patchworklib\n", + "#\n", + "!pip install patchworklib" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a2ad35d0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "meta NOT subset; don't know how to subset; dropped\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import patchworklib as pw\n", + "\n", + "p1 = pw.load_ggplot(plots[1], figsize=(4,4))\n", + "p2 = pw.load_ggplot(plots[0], figsize=(4,4))\n", + "p3 = pw.load_ggplot(plots[2], figsize=(4,4))\n", + "\n", + "arrangement = (p1|p2)[p1]/p3\n", + "arrangement.savefig(\"train_and_test_linear_trends.pdf\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/08_results_clauses_and_sketches_knockout_5groups.ipynb b/08_results_clauses_and_sketches_knockout_5groups.ipynb new file mode 100644 index 00000000..0f6352db --- /dev/null +++ b/08_results_clauses_and_sketches_knockout_5groups.ipynb @@ -0,0 +1,1229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "92b47bf4", + "metadata": {}, + "source": [ + "## Results of the clauses experiment and the syntactic sketch knockout experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b658bcdc", + "metadata": {}, + "outputs": [], + "source": [ + "import os, os.path\n", + "import re\n", + "\n", + "from pandas import DataFrame as df\n", + "from pandas import read_csv\n", + "import pandas\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8217e875", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_crossvalidation.csv\n", + "results_ensemble_conf_intervals.csv\n", + "results_ensemble_entropy.csv\n", + "results_ensemble_majority_voting.csv\n", + 
"results_full_data_malt_udpipe1.csv\n", + "results_gap_experiments.csv\n", + "results_gold_and_auto_ud_morph.csv\n", + "results_half_data.csv\n", + "results_smaller_data.csv\n", + "results_stanza_basic.csv\n", + "results_stanza_ME_conf_intervals.csv\n", + "results_stanza_ME_error_types.csv\n", + "results_stanza_ME_full_predict_on_clauses.csv\n", + "results_stanza_ME_full_predict_on_clauses_error_types.csv\n", + "results_stanza_ME_on_clauses.csv\n", + "results_stanza_ME_on_clauses_error_types.csv\n", + "results_stanza_ME_sketches_5groups_knockout.csv\n", + "results_stanza_ME_sketches_5groups_knockout_matrix.csv\n", + "results_stanza_ME_sketches_5randomgroups_knockout.csv\n", + "results_stanza_ME_sketches_5randomgroups_knockout_matrix.csv\n" + ] + } + ], + "source": [ + "input_dir = 'edt_2.6' # Experiments done on Estonian Dependency Treebank version 2.6\n", + "\n", + "for fname in os.listdir(input_dir):\n", + " if fname.startswith('results_') and fname.endswith('.csv'):\n", + " print(fname)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f8ecd862", + "metadata": {}, + "outputs": [], + "source": [ + "pandas.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "markdown", + "id": "1060eac5", + "metadata": {}, + "source": [ + "### A. the clauses experiment: train/evaluate stanza on clauses instead of on sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bf03c093", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LAS_testLAS_trainLAS_gapUAS_testUAS_train
experiment
eval_stanza_morph_extended_on_clauses0.86230.92980.06740.89240.9492
\n", + "
" + ], + "text/plain": [ + " LAS_test LAS_train LAS_gap UAS_test \n", + "experiment \n", + "eval_stanza_morph_extended_on_clauses 0.8623 0.9298 0.0674 0.8924 \\\n", + "\n", + " UAS_train \n", + "experiment \n", + "eval_stanza_morph_extended_on_clauses 0.9492 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# trained and evaluated on clauses\n", + "read_csv('edt_2.6/results_stanza_ME_on_clauses.csv', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f71e9b5e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
experimentLAS_testLAS_trainLAS_gapUAS_testUAS_traintest_wordstrain_words
1eval_stanza_morph_extended_full_data0.84860.91760.06890.87820.937848491389278
\n", + "
" + ], + "text/plain": [ + " experiment LAS_test LAS_train LAS_gap \n", + "1 eval_stanza_morph_extended_full_data 0.8486 0.9176 0.0689 \\\n", + "\n", + " UAS_test UAS_train test_words train_words \n", + "1 0.8782 0.9378 48491 389278 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for a comparison #1: trained and evaluated on sentences\n", + "full_data_results = read_csv('edt_2.6/results_stanza_basic.csv')\n", + "full_data_results[full_data_results['experiment'].str.contains('morph_extended')]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fad4c2ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LAS_testLAS_trainUAS_testUAS_trainLAS_gap
experiment
eval_stanza_morph_extended_full_predict_on_clauses0.85440.90760.88730.93410.0532
\n", + "
" + ], + "text/plain": [ + " LAS_test LAS_train \n", + "experiment \n", + "eval_stanza_morph_extended_full_predict_on_clauses 0.8544 0.9076 \\\n", + "\n", + " UAS_test UAS_train \n", + "experiment \n", + "eval_stanza_morph_extended_full_predict_on_clauses 0.8873 0.9341 \\\n", + "\n", + " LAS_gap \n", + "experiment \n", + "eval_stanza_morph_extended_full_predict_on_clauses 0.0532 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for a comparison #2: trained on sentences and evaluated on clauses\n", + "read_csv('edt_2.6/results_stanza_ME_full_predict_on_clauses.csv', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6dc2b383", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
E1E2E3E1_impactE2_impactE3_impactE1_rel_errorE2_rel_errorE3_rel_error
experiment
stanza_ME_on_clauses_error_types_on_test45013273370.87140.06330.06520.14050.01020.0555
\n", + "
" + ], + "text/plain": [ + " E1 E2 E3 E1_impact \n", + "experiment \n", + "stanza_ME_on_clauses_error_types_on_test 4501 327 337 0.8714 \\\n", + "\n", + " E2_impact E3_impact E1_rel_error \n", + "experiment \n", + "stanza_ME_on_clauses_error_types_on_test 0.0633 0.0652 0.1405 \\\n", + "\n", + " E2_rel_error E3_rel_error \n", + "experiment \n", + "stanza_ME_on_clauses_error_types_on_test 0.0102 0.0555 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Error types of the model trained and evaluated on clauses\n", + "err_types1 = read_csv('edt_2.6/results_stanza_ME_on_clauses_error_types.csv', index_col=0)\n", + "err_types1[['E1', 'E2', 'E3', 'E1_impact', 'E2_impact', 'E3_impact', 'E1_rel_error', 'E2_rel_error', 'E3_rel_error', ]]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "42299eb8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
E1E2E3E1_impactE2_impactE3_impactE1_rel_errorE2_rel_errorE3_rel_error
experiment
stanza_ME_error_types_on_test482139811620.75550.06240.18210.14130.01170.1733
\n", + "
" + ], + "text/plain": [ + " E1 E2 E3 E1_impact E2_impact \n", + "experiment \n", + "stanza_ME_error_types_on_test 4821 398 1162 0.7555 0.0624 \\\n", + "\n", + " E3_impact E1_rel_error E2_rel_error \n", + "experiment \n", + "stanza_ME_error_types_on_test 0.1821 0.1413 0.0117 \\\n", + "\n", + " E3_rel_error \n", + "experiment \n", + "stanza_ME_error_types_on_test 0.1733 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for a comparison #1: Error types of the model trained and evaluated on sentences\n", + "err_types2 = read_csv('edt_2.6/results_stanza_ME_error_types.csv', index_col=0)\n", + "err_types2[['E1', 'E2', 'E3', 'E1_impact', 'E2_impact', 'E3_impact', 'E1_rel_error', 'E2_rel_error', 'E3_rel_error', ]]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1e12969c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
E1E2E3E1_impactE2_impactE3_impactE1_rel_errorE2_rel_errorE3_rel_error
experiment
stanza_ME_full_on_clauses_error_types_on_test47743523550.8710.06420.06480.1490.0110.0585
\n", + "
" + ], + "text/plain": [ + " E1 E2 E3 E1_impact \n", + "experiment \n", + "stanza_ME_full_on_clauses_error_types_on_test 4774 352 355 0.871 \\\n", + "\n", + " E2_impact E3_impact \n", + "experiment \n", + "stanza_ME_full_on_clauses_error_types_on_test 0.0642 0.0648 \\\n", + "\n", + " E1_rel_error E2_rel_error \n", + "experiment \n", + "stanza_ME_full_on_clauses_error_types_on_test 0.149 0.011 \\\n", + "\n", + " E3_rel_error \n", + "experiment \n", + "stanza_ME_full_on_clauses_error_types_on_test 0.0585 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for a comparison #2: Error types of the model trained on sentences and evaluated on clauses\n", + "err_types2 = read_csv('edt_2.6/results_stanza_ME_full_predict_on_clauses_error_types.csv', index_col=0)\n", + "err_types2[['E1', 'E2', 'E3', 'E1_impact', 'E2_impact', 'E3_impact', 'E1_rel_error', 'E2_rel_error', 'E3_rel_error', ]]" + ] + }, + { + "cell_type": "markdown", + "id": "9ca17669", + "metadata": {}, + "source": [ + "### B. the syntactic sketches knockout experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "043a8d4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
grouped_sketchessupport
0[V]nsubj(L)obl(L);[V]nsubj(L)obl(L)obl(P);[S]advmod(L)cop(L)nsubj:cop(L);[S]cop(L)nsubj:cop(P);[V]aux(L)nsubj(L)obl(L);[V]obj(L)obl(L);[V]nsubj(L)xcomp(P);[V]obj(P)obl(L);[V]obj(P);[V]obj(L)4157
1[S]nmod(L);[V]advmod(L)nsubj(P);[V]aux(L)nsubj(L)obj(L);[S]amod(L);[V]advmod(L);[V];[V]nsubj(L)obj(P);[S];[V]nsubj(L)obj(P)obl(L);[V]nsubj(P)obl(L)5167
2[V]obj(L)obl(P);[V]nsubj(L)obj(L);[S]amod(L)cop(L)nsubj:cop(L);[V]advmod(L)obl(L);[S]cop(L)nmod(L)nsubj:cop(L);[V]advmod(L)nsubj(L)obj(L);[V]nsubj(L)obj(L)obl(P);[V]advmod(L)nsubj(L)obj(L)obl(L);[V]advmod(L)obj(L);[S]cop(L)nsubj:cop(L)3374
3[V]nsubj(L)obj(L)obl(L);[S]advmod(L);[S]cop(L)nsubj:cop(L)obl(L);[V]nsubj(L)obl(P);[V]nsubj(P)obl(P);[V]advmod(L)nsubj(L)obl(P);[V]nsubj(L);[V]advmod(L)nsubj(L);[V]nsubj(L)xcomp(L);[V]obl(P)4904
4[V]nsubj(L)obl(L)obl(L);[V]advmod(L)nsubj(L)obl(L);[V]aux(L);[X];[V]xcomp(P);[V]aux(L)nsubj(L);[S]nummod(L);[V]nsubj(P);[V]nsubj(P)obj(L);[V]obl(L)3332
\n", + "
" + ], + "text/plain": [ + " grouped_sketches \n", + "0 [V]nsubj(L)obl(L);[V]nsubj(L)obl(L)obl(P);[S]advmod(L)cop(L)nsubj:cop(L);[S]cop(L)nsubj:cop(P);[V]aux(L)nsubj(L)obl(L);[V]obj(L)obl(L);[V]nsubj(L)xcomp(P);[V]obj(P)obl(L);[V]obj(P);[V]obj(L) \\\n", + "1 [S]nmod(L);[V]advmod(L)nsubj(P);[V]aux(L)nsubj(L)obj(L);[S]amod(L);[V]advmod(L);[V];[V]nsubj(L)obj(P);[S];[V]nsubj(L)obj(P)obl(L);[V]nsubj(P)obl(L) \n", + "2 [V]obj(L)obl(P);[V]nsubj(L)obj(L);[S]amod(L)cop(L)nsubj:cop(L);[V]advmod(L)obl(L);[S]cop(L)nmod(L)nsubj:cop(L);[V]advmod(L)nsubj(L)obj(L);[V]nsubj(L)obj(L)obl(P);[V]advmod(L)nsubj(L)obj(L)obl(L);[V]advmod(L)obj(L);[S]cop(L)nsubj:cop(L) \n", + "3 [V]nsubj(L)obj(L)obl(L);[S]advmod(L);[S]cop(L)nsubj:cop(L)obl(L);[V]nsubj(L)obl(P);[V]nsubj(P)obl(P);[V]advmod(L)nsubj(L)obl(P);[V]nsubj(L);[V]advmod(L)nsubj(L);[V]nsubj(L)xcomp(L);[V]obl(P) \n", + "4 [V]nsubj(L)obl(L)obl(L);[V]advmod(L)nsubj(L)obl(L);[V]aux(L);[X];[V]xcomp(P);[V]aux(L)nsubj(L);[S]nummod(L);[V]nsubj(P);[V]nsubj(P)obj(L);[V]obl(L) \n", + "\n", + " support \n", + "0 4157 \n", + "1 5167 \n", + "2 3374 \n", + "3 4904 \n", + "4 3332 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Randomly grouped sketches\n", + "read_csv('edt_2.6/preannotated/morph_extended_clauses/top_50_sketches_5_groups.csv', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2f222980", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
group0group1group2group3group4
group089.3190.4691.0791.3990.93
group191.9390.4791.8991.5491.54
group292.9392.5091.3191.8692.46
group391.3691.2291.8689.0391.68
group491.9491.7992.0492.0489.47
\n", + "
" + ], + "text/plain": [ + " group0 group1 group2 group3 group4\n", + "group0 89.31 90.46 91.07 91.39 90.93\n", + "group1 91.93 90.47 91.89 91.54 91.54\n", + "group2 92.93 92.50 91.31 91.86 92.46\n", + "group3 91.36 91.22 91.86 89.03 91.68\n", + "group4 91.94 91.79 92.04 92.04 89.47" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test matrix: rows are test sets, columns are models\n", + "test_matrix = read_csv('edt_2.6/results_stanza_ME_sketches_5groups_knockout_matrix.csv', index_col=0)\n", + "test_matrix = test_matrix.mul(100.0)\n", + "test_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1298085d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = sns.heatmap(test_matrix).get_figure()\n", + "plt.xlabel('Training (excludes group)', fontsize = 12)\n", + "plt.ylabel('Evaluation (exclusive to group)', fontsize = 12)\n", + "fig.savefig(\"sketches_5groups_knockout_matrix.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f747e8d7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
group0group1group2group3group4
group091.1490.5791.5091.1491.00
group191.2991.7691.5490.9891.54
group291.9892.1892.3091.8691.98
group391.7991.5491.3691.6191.82
group492.3991.8491.8491.8492.04
\n", + "
" + ], + "text/plain": [ + " group0 group1 group2 group3 group4\n", + "group0 91.14 90.57 91.50 91.14 91.00\n", + "group1 91.29 91.76 91.54 90.98 91.54\n", + "group2 91.98 92.18 92.30 91.86 91.98\n", + "group3 91.79 91.54 91.36 91.61 91.82\n", + "group4 92.39 91.84 91.84 91.84 92.04" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Random control group test matrix: rows are test sets, columns are models\n", + "rnd_test_matrix = read_csv('edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout_matrix.csv', index_col=0)\n", + "rnd_test_matrix = rnd_test_matrix.mul(100.0)\n", + "rnd_test_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8618dc6b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = sns.heatmap(rnd_test_matrix).get_figure()\n", + "plt.xlabel('Training (excludes group)', fontsize = 12)\n", + "plt.ylabel('Evaluation (exclusive to group)', fontsize = 12)\n", + "fig.savefig(\"sketches_5groups_random_knockout_matrix.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "13887eaf", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Display both heatmaps on a single figure\n", + "import matplotlib.pyplot as plt\n", + "\n", + "vmin = min(test_matrix.values.min(), rnd_test_matrix.values.min())\n", + "vmax = max(test_matrix.values.max(), rnd_test_matrix.values.max())\n", + "\n", + "fig, axs = plt.subplots(ncols=3, gridspec_kw=dict(width_ratios=[10,10,1]), figsize=(10, 4))\n", + "\n", + "sns.heatmap(test_matrix, annot=True, cbar=False, ax=axs[0], vmin=vmin, vmax=vmax)\n", + "sns.heatmap(rnd_test_matrix, annot=True, yticklabels=False, cbar=False, ax=axs[1], vmax=vmax, vmin=vmin)\n", + "\n", + "axs[0].set_title('main experiment', fontsize = 10)\n", + "axs[1].set_title('randomized control experiment', fontsize = 10)\n", + "axs[0].set_xlabel('Training (excludes group)', fontsize = 10)\n", + "axs[1].set_xlabel('Training (excludes group)', fontsize = 10)\n", + "axs[0].set_ylabel('Evaluation (exclusive to group)', fontsize = 10)\n", + "\n", + "fig.colorbar(axs[1].collections[0], cax=axs[2])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f080d4ba", + "metadata": {}, + "outputs": [], + "source": [ + "fig.savefig(\"sketches_5groups_both_experiment_matrices.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d86a4d5e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LAS_test
experiment
eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group091.42
eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group191.26
eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group291.61
eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group391.08
eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group491.29
\n", + "
" + ], + "text/plain": [ + " LAS_test\n", + "experiment \n", + "eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group0 91.42\n", + "eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group1 91.26\n", + "eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group2 91.61\n", + "eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group3 91.08\n", + "eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group4 91.29" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Results on test_50x50 file\n", + "r1 = read_csv('edt_2.6/results_stanza_ME_sketches_5groups_knockout.csv', index_col=0).drop(['UAS_test'], axis=1)\n", + "r1.mul(100.0)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "cf1e99b5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LAS_test
experiment
eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group091.69
eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group191.54
eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group291.69
eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group391.48
eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group491.65
\n", + "
" + ], + "text/plain": [ + " LAS_test\n", + "experiment \n", + "eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group0 91.69\n", + "eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group1 91.54\n", + "eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group2 91.69\n", + "eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group3 91.48\n", + "eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group4 91.65" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Results on test_50x50 file (random control group)\n", + "r2 = read_csv('edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout.csv', index_col=0).drop(['UAS_test'], axis=1)\n", + "r2.mul(100.0)" + ] + }, + { + "cell_type": "markdown", + "id": "566c2596", + "metadata": {}, + "source": [ + "#### Get Hoeffding's confidence bounds for test groups" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "dccae904", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test_group0.conllu | words: 2777 | Hoeffding's bound: ± 2.58%\n", + "test_group1.conllu | words: 2318 | Hoeffding's bound: ± 2.82%\n", + "test_group2.conllu | words: 2532 | Hoeffding's bound: ± 2.70%\n", + "test_group3.conllu | words: 2825 | Hoeffding's bound: ± 2.56%\n", + "test_group4.conllu | words: 1985 | Hoeffding's bound: ± 3.05%\n", + "test_50x50.conllu | words: 12437 | Hoeffding's bound: ± 1.22%\n" + ] + } + ], + "source": [ + "import math\n", + "import os, os.path\n", + "import conllu\n", + "\n", + "# Counts words in conllu file.\n", + "def count_words(conllu_file):\n", + " with open(conllu_file, 'r', encoding='utf-8') as input_file:\n", + " conll_sentences = conllu.parse(input_file.read())\n", + " words = 0\n", + " for sentence in conll_sentences:\n", + " for word in sentence:\n", + " words += 1\n", + " return words\n", + 
"\n", + "# Calculates Hoeffding's bound. \n", + "# Parameters:\n", + "# n -- evaluation set size in words\n", + "# alpha -- confidence level is (1 - alpha)\n", + "def hoeffding_bounds(n, alpha):\n", + " return math.sqrt( (1/(2*n)*math.log(2/(alpha))) )\n", + "\n", + "# Calculate Hoeffding's bound for knockout test files\n", + "input_dirs = ['edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_splits', \\\n", + " 'edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial']\n", + "for input_dir in input_dirs:\n", + " for fname in os.listdir(input_dir):\n", + " if fname.startswith('test_'):\n", + " fpath = os.path.join(input_dir, fname)\n", + " words = count_words(fpath)\n", + " hfb_test = hoeffding_bounds(n=words, alpha=0.05)*100.0\n", + " print(fname, f\"| words: {words} | Hoeffding's bound: ± {hfb_test:.2f}%\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md deleted file mode 100644 index 545e8f67..00000000 --- a/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Syntax experiments - -Various experiments with syntax analysers splitted among different branches. - -### I. Ablation experiments - -Investigations on the impact of training set size, morphological features and clausal patterns on syntactic parsing; experiments with parsing model ensembles; automatic evaluation and investigation of parsing errors - -**Branch:** [ablation_experiments](https://github.com/estnltk/syntax_experiments/tree/ablation_experiments) - - -### II. Syntax consistency - -Search for sentence level modifications that preserve syntax. 
In theory, syntax should be conserved when free entities are removed from sentences. Sometimes this is also true for bound entities such as objects. In practice, additional factors, such as interpunctuation and wording consistency, have a big inpact on the outcomes. - -The branch contains workflows for shortening sentences and preparing the results for manual labelling followed by small analysis of the results. Experiments are done for different phrase types where each phrase type is defined by the dependency relation predicted by stanza syntax analyser. - -**Branch:** [syntax_consistency](https://github.com/estnltk/syntax_experiments/tree/syntax_consistency) - - -### III. Consistency between adverbial phrases and named entities - -Adverbial phrases often coincide with geographical location or time expression, but not always. In these experiments, we study this problem in detail. For that we use a stanza syntax analyzer to extract adverbials and dedicated taggers for isolating named entities and time expressions. After that we build a workflow for extracting sentences where these annaotations are in potential conflict and extract corresponding sentences for manual labelling. - - -**Branch:** [adverbials](https://github.com/estnltk/syntax_experiments/tree/adverbials) - -### IV. Subcategorisation and argument structure - -Statistical methods for extracting information about subcategorisation of verbs using only automatically generated syntax for a large text corpus. -Subcategorization for verbs is defined by argument structure that specifies a list of selected arguments associated with specific lexical restrictions. -For Estonian, these restrictions are defined in terms of plausible cases. -It is imporant to note that not all arguments in the argument structure are mandatory and that the same verb can have more than one argument structure. - -Still it possible to use law of large numbers to extract important information about argument structure. 
-For that, we tabulate syntax level collocations between verb phrases and noun phrases defined through obl dependency relation. - -**Branch:** [subcat](https://github.com/estnltk/syntax_experiments/tree/subcat) - -### V. Semantic labelling - -Semantical categorisation based on the arguments structure. -Verbs place semantic restrictions on their arguments. -This can be exploited to categorise nouns into sementic categories and the other way around -- find whether a particular argument of a verb must satisfy certain restrictions. - -**Branch:** [semantic_labelling](https://github.com/estnltk/syntax_experiments/tree/semantic_labelling) - -### VI. Outdated experiments - -Contains code of legacy experiments that are no longer supported - -**Branch:** [legacy](https://github.com/estnltk/syntax_experiments/tree/legacy) diff --git a/X1_previously_reported_performances.ipynb b/X1_previously_reported_performances.ipynb new file mode 100644 index 00000000..0b5a6ee8 --- /dev/null +++ b/X1_previously_reported_performances.ipynb @@ -0,0 +1,917 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c5afafad", + "metadata": {}, + "source": [ + "### Previously reported dependency parsing performances for Estonian and English\n", + "\n", + "Sources:\n", + "\n", + "* Stanford @ CoNLL 2017 Shared Task: https://universaldependencies.org/conll17/results-las.html\n", + "* HIT-SCIR, StanfordNLP & UDPipe 1.2 @ CoNLL 2018 Shared Task: https://universaldependencies.org/conll18/results-las.html\n", + "* Stanza version 1.0: https://stanfordnlp.github.io/stanza/v100performance.html\n", + "* Stanza version 1.3: https://stanfordnlp.github.io/stanza/v130performance.html\n", + "* UDPipe 2: https://ufal.mff.cuni.cz/udpipe/2/models \n", + "* Udify: https://arxiv.org/pdf/1904.02099.pdf\n", + "* Trankit: https://trankit.readthedocs.io/en/latest/performance.html\n", + "* RobertNLP @ IWPT 2021 Shared Task: https://universaldependencies.org/iwpt21/results_official_fine.html ; 
https://aclanthology.org/2021.iwpt-1.21.pdf\n", + "\n", + "* Corpus sizes:\n", + " * Estonian:\n", + " * https://github.com/UniversalDependencies/UD_Estonian-EDT/blob/r2.0/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_Estonian-EDT/blob/r2.2/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_Estonian-EDT/blob/r2.5/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_Estonian-EDT/blob/r2.7/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_Estonian-EWT/blob/r2.7/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_Estonian-EDT/blob/r2.8/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_Estonian-EDT/blob/r2.10/stats.xml\n", + " * English:\n", + " * https://github.com/UniversalDependencies/UD_English-EWT/blob/r2.0/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_English-EWT/blob/r2.2/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_English-EWT/blob/r2.5/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_English-EWT/blob/r2.7/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_English-GUM/blob/r2.7/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_English-EWT/blob/r2.6/stats.xml\n", + " * https://github.com/UniversalDependencies/UD_English-EWT/blob/r2.10/stats.xml\n", + "\n", + "(last checked: 2024-02-01)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "243cc05f", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install adjustText" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b89be590", + "metadata": {}, + "outputs": [], + "source": [ + "from plotnine import ggplot, geom_point, aes, geom_line, geom_text\n", + "from plotnine import geom_abline, geom_hline, ggtitle, theme, xlab, labs\n", + "from plotnine import scale_color_manual, element_line, element_text, theme_bw\n", + "from plotnine import facet_wrap, geom_step\n", + "import pandas as pd" + ] + }, + { + "cell_type": 
"code", + "execution_count": 3, + "id": "e89c15a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ud_versionexact_corpus_sizecorpus_sizeLASparsercorpusyear
02.0346283471.65Stanfordet_edt2017
12.236635136685.35HIT-SCIRet_edt2018
22.236635136683.84StanfordNLPet_edt2018
32.543776943783.82Stanza v1.0et_edt2020
42.843776743784.43Stanza v1.3et_edt2021
52.236635136675.02UDPipe 1.2et_edt2018
62.236635136683.26UDPipe 2 prototypeet_edt2018
72.143785043786.16UDPipe 2et_edt2022
82.3120323091203286.67Udify v1.0et_edt2019
92.749416849389.52RobertNLPet_edt2021
102.543776943789.52Trankit-Let_edt2021
112.022975322982.23Stanforden_ewt2017
122.225482925484.57HIT-SCIRen_ewt2018
132.225482925483.87StanfordNLPen_ewt2018
142.525482925483.59Stanza v1.0en_ewt2020
152.825483025484.91Stanza v1.3en_ewt2021
162.225482925477.56UDPipe 1.2en_ewt2018
172.225482925482.51UDPipe 2 prototypeen_ewt2018
182.125482125488.10UDPipe 2en_ewt2022
192.3120323091203288.50Udify v1.0en_ewt2019
202.736821436790.27RobertNLPen_ewt2021
212.525482925489.40Trankit-Len_ewt2021
\n", + "
" + ], + "text/plain": [ + " ud_version exact_corpus_size corpus_size LAS parser \n", + "0 2.0 34628 34 71.65 Stanford \\\n", + "1 2.2 366351 366 85.35 HIT-SCIR \n", + "2 2.2 366351 366 83.84 StanfordNLP \n", + "3 2.5 437769 437 83.82 Stanza v1.0 \n", + "4 2.8 437767 437 84.43 Stanza v1.3 \n", + "5 2.2 366351 366 75.02 UDPipe 1.2 \n", + "6 2.2 366351 366 83.26 UDPipe 2 prototype \n", + "7 2.1 437850 437 86.16 UDPipe 2 \n", + "8 2.3 12032309 12032 86.67 Udify v1.0 \n", + "9 2.7 494168 493 89.52 RobertNLP \n", + "10 2.5 437769 437 89.52 Trankit-L \n", + "11 2.0 229753 229 82.23 Stanford \n", + "12 2.2 254829 254 84.57 HIT-SCIR \n", + "13 2.2 254829 254 83.87 StanfordNLP \n", + "14 2.5 254829 254 83.59 Stanza v1.0 \n", + "15 2.8 254830 254 84.91 Stanza v1.3 \n", + "16 2.2 254829 254 77.56 UDPipe 1.2 \n", + "17 2.2 254829 254 82.51 UDPipe 2 prototype \n", + "18 2.1 254821 254 88.10 UDPipe 2 \n", + "19 2.3 12032309 12032 88.50 Udify v1.0 \n", + "20 2.7 368214 367 90.27 RobertNLP \n", + "21 2.5 254829 254 89.40 Trankit-L \n", + "\n", + " corpus year \n", + "0 et_edt 2017 \n", + "1 et_edt 2018 \n", + "2 et_edt 2018 \n", + "3 et_edt 2020 \n", + "4 et_edt 2021 \n", + "5 et_edt 2018 \n", + "6 et_edt 2018 \n", + "7 et_edt 2022 \n", + "8 et_edt 2019 \n", + "9 et_edt 2021 \n", + "10 et_edt 2021 \n", + "11 en_ewt 2017 \n", + "12 en_ewt 2018 \n", + "13 en_ewt 2018 \n", + "14 en_ewt 2020 \n", + "15 en_ewt 2021 \n", + "16 en_ewt 2018 \n", + "17 en_ewt 2018 \n", + "18 en_ewt 2022 \n", + "19 en_ewt 2019 \n", + "20 en_ewt 2021 \n", + "21 en_ewt 2021 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {\n", + "'ud_version': [2.0, 2.2, 2.2, 2.5, 2.8, 2.2, 2.2, 2.10, 2.3, 2.7, 2.5]+\\\n", + " [2.0, 2.2, 2.2, 2.5, 2.8, 2.2, 2.2, 2.10, 2.3, 2.7, 2.5],\n", + "# exact corpus size (in words)\n", + "'exact_corpus_size':\n", + " [34628, 366351, 366351, 437769, 437767, 366351, 366351, 437850, 12032309, 437769+56399, 
437769]+\\\n", + " [229753, 254829, 254829, 254829, 254830, 254829, 254829, 254821, 12032309, 254829+113385, 254829], \n", + "# total corpus size (in thousands of words)\n", + "'corpus_size': [ 34, 366, 366, 437, 437, 366, 366, 437, 12032, 437+56, 437]+\\\n", + " [229, 254, 254, 254, 254, 254, 254, 254, 12032, 254+113, 254], \n", + "'LAS': [71.65, 85.35, 83.84, 83.82, 84.43, 75.02, 83.26, 86.16, 86.67, 89.52, 89.52]+\\\n", + " [82.23, 84.57, 83.87, 83.59, 84.91, 77.56, 82.51, 88.10, 88.50, 90.27, 89.40],\n", + "'parser': ['Stanford', 'HIT-SCIR', 'StanfordNLP', 'Stanza v1.0', 'Stanza v1.3', 'UDPipe 1.2', 'UDPipe 2 prototype', 'UDPipe 2', 'Udify v1.0', 'RobertNLP', 'Trankit-L']+\\\n", + " ['Stanford', 'HIT-SCIR', 'StanfordNLP', 'Stanza v1.0', 'Stanza v1.3', 'UDPipe 1.2', 'UDPipe 2 prototype', 'UDPipe 2', 'Udify v1.0', 'RobertNLP', 'Trankit-L'],\n", + "# corpus name\n", + "'corpus': ['et_edt', 'et_edt', 'et_edt', 'et_edt', 'et_edt', 'et_edt', 'et_edt', 'et_edt', 'et_edt', 'et_edt', 'et_edt']+\\\n", + " ['en_ewt', 'en_ewt', 'en_ewt', 'en_ewt', 'en_ewt', 'en_ewt', 'en_ewt', 'en_ewt', 'en_ewt', 'en_ewt', 'en_ewt'],\n", + "'year': [2017, 2018, 2018, 2020, 2021, 2018, 2018, 2022, 2019, 2021, 2021]+\\\n", + " [2017, 2018, 2018, 2020, 2021, 2018, 2018, 2022, 2019, 2021, 2021],\n", + "}\n", + "results = pd.DataFrame.from_dict(data)\n", + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b3500c8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ud_versionexact_corpus_sizecorpus_sizeLASparsercorpusyearLAS_lowerLAS_upper
02.0346283471.65Stanfordet_edt201769.3473.96
12.236635136685.35HIT-SCIRet_edt201884.6486.06
22.236635136683.84StanfordNLPet_edt201883.1384.55
32.543776943783.82Stanza v1.0et_edt202083.1784.47
42.843776743784.43Stanza v1.3et_edt202183.7885.08
52.236635136675.02UDPipe 1.2et_edt201874.3175.73
62.236635136683.26UDPipe 2 prototypeet_edt201882.5583.97
72.143785043786.16UDPipe 2et_edt202285.5186.81
82.3120323091203286.67Udify v1.0et_edt201986.5586.79
92.749416849389.52RobertNLPet_edt202188.9190.13
102.543776943789.52Trankit-Let_edt202188.8790.17
112.022975322982.23Stanforden_ewt201781.3383.13
122.225482925484.57HIT-SCIRen_ewt201883.7285.42
132.225482925483.87StanfordNLPen_ewt201883.0284.72
142.525482925483.59Stanza v1.0en_ewt202082.7484.44
152.825483025484.91Stanza v1.3en_ewt202184.0685.76
162.225482925477.56UDPipe 1.2en_ewt201876.7178.41
172.225482925482.51UDPipe 2 prototypeen_ewt201881.6683.36
182.125482125488.10UDPipe 2en_ewt202287.2588.95
192.3120323091203288.50Udify v1.0en_ewt201988.3888.62
202.736821436790.27RobertNLPen_ewt202189.5690.98
212.525482925489.40Trankit-Len_ewt202188.5590.25
\n", + "
" + ], + "text/plain": [ + " ud_version exact_corpus_size corpus_size LAS parser \n", + "0 2.0 34628 34 71.65 Stanford \\\n", + "1 2.2 366351 366 85.35 HIT-SCIR \n", + "2 2.2 366351 366 83.84 StanfordNLP \n", + "3 2.5 437769 437 83.82 Stanza v1.0 \n", + "4 2.8 437767 437 84.43 Stanza v1.3 \n", + "5 2.2 366351 366 75.02 UDPipe 1.2 \n", + "6 2.2 366351 366 83.26 UDPipe 2 prototype \n", + "7 2.1 437850 437 86.16 UDPipe 2 \n", + "8 2.3 12032309 12032 86.67 Udify v1.0 \n", + "9 2.7 494168 493 89.52 RobertNLP \n", + "10 2.5 437769 437 89.52 Trankit-L \n", + "11 2.0 229753 229 82.23 Stanford \n", + "12 2.2 254829 254 84.57 HIT-SCIR \n", + "13 2.2 254829 254 83.87 StanfordNLP \n", + "14 2.5 254829 254 83.59 Stanza v1.0 \n", + "15 2.8 254830 254 84.91 Stanza v1.3 \n", + "16 2.2 254829 254 77.56 UDPipe 1.2 \n", + "17 2.2 254829 254 82.51 UDPipe 2 prototype \n", + "18 2.1 254821 254 88.10 UDPipe 2 \n", + "19 2.3 12032309 12032 88.50 Udify v1.0 \n", + "20 2.7 368214 367 90.27 RobertNLP \n", + "21 2.5 254829 254 89.40 Trankit-L \n", + "\n", + " corpus year LAS_lower LAS_upper \n", + "0 et_edt 2017 69.34 73.96 \n", + "1 et_edt 2018 84.64 86.06 \n", + "2 et_edt 2018 83.13 84.55 \n", + "3 et_edt 2020 83.17 84.47 \n", + "4 et_edt 2021 83.78 85.08 \n", + "5 et_edt 2018 74.31 75.73 \n", + "6 et_edt 2018 82.55 83.97 \n", + "7 et_edt 2022 85.51 86.81 \n", + "8 et_edt 2019 86.55 86.79 \n", + "9 et_edt 2021 88.91 90.13 \n", + "10 et_edt 2021 88.87 90.17 \n", + "11 en_ewt 2017 81.33 83.13 \n", + "12 en_ewt 2018 83.72 85.42 \n", + "13 en_ewt 2018 83.02 84.72 \n", + "14 en_ewt 2020 82.74 84.44 \n", + "15 en_ewt 2021 84.06 85.76 \n", + "16 en_ewt 2018 76.71 78.41 \n", + "17 en_ewt 2018 81.66 83.36 \n", + "18 en_ewt 2022 87.25 88.95 \n", + "19 en_ewt 2019 88.38 88.62 \n", + "20 en_ewt 2021 89.56 90.98 \n", + "21 en_ewt 2021 88.55 90.25 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import math\n", + "\n", + "# Calculates 
Hoeffding's bound. \n", + "# Parameters:\n", + "# n -- evaluation set size in words\n", + "# alpha -- confidence level is (1 - alpha)\n", + "def hoeffding_bounds(n, alpha=0.05):\n", + " return math.sqrt( (1/(2*n)*math.log(2/(alpha))) )\n", + "\n", + "# Add Hoeffding bounds confidence intervals to each LAS in table\n", + "def add_hoeffding_bound_conf_intervals(dframe, test_proportion=0.1):\n", + " # Initialize new columns\n", + " dframe['LAS_lower'] = [0.0]*len(dframe)\n", + " dframe['LAS_upper'] = [0.0]*len(dframe)\n", + " # Calculate conf intervals for each LAS\n", + " for ind in dframe.index:\n", + " LAS = dframe['LAS'][ind]\n", + " corpus_size = dframe['exact_corpus_size'][ind]\n", + " test_corpus_size = corpus_size*test_proportion\n", + " hfb = hoeffding_bounds(test_corpus_size) * 100.0\n", + " dframe.loc[ind,'LAS_lower'] = round(LAS - hfb, 2)\n", + " dframe.loc[ind,'LAS_upper'] = round(LAS + hfb, 2)\n", + " return dframe\n", + "\n", + "add_hoeffding_bound_conf_intervals(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "897a20ce", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "p = ggplot(results) + theme_bw()\n", + "p = p + theme(legend_title=element_text(size=10), figure_size=(10, 4),)\n", + "p = p + facet_wrap(['corpus'], nrow = 1, labeller={'et_edt':'et_edt (Estonian)', 'en_ewt':'en_ewt (English)'})\n", + "p = p + geom_point(aes(x='year', y='LAS', color='corpus', size='corpus_size'), show_legend={'size': True})\n", + "# Stanza's 95% conf intervals on EN\n", + "p = p + geom_step(aes(x='year', y='LAS_upper'), \n", + " data=results[(results['corpus']=='en_ewt') & results['parser'].str.contains('Stan')], \n", + " linetype='dashed', color='red')\n", + "p = p + geom_step(aes(x='year', y='LAS_lower'), \n", + " data=results[(results['corpus']=='en_ewt') & results['parser'].str.contains('Stan')], \n", + " linetype='dashed', color='red')\n", + "# Stanza's 95% conf intervals on ET\n", + "p = p + geom_step(aes(x='year', y='LAS_upper'), \n", + " data=results[(results['corpus']=='et_edt') & results['parser'].str.contains('Stan')], \n", + " linetype='dashed', color='blue')\n", + "p = p + geom_step(aes(x='year', y='LAS_lower'), \n", + " data=results[(results['corpus']=='et_edt') & results['parser'].str.contains('Stan')], \n", + " linetype='dashed', color='blue')\n", + "# Complete conf interval lines (extend from 2021 to 2022)\n", + "last_en_ewt_stanza = \\\n", + " (results[(results['corpus']=='en_ewt') & results['parser'].str.contains('Stan')].tail(1)).copy()\n", + "last_en_ewt_stanza = pd.concat([last_en_ewt_stanza, last_en_ewt_stanza], ignore_index=True)\n", + "last_en_ewt_stanza.loc[1, 'year'] = 2022\n", + "last_et_edt_stanza = \\\n", + " (results[(results['corpus']=='et_edt') & results['parser'].str.contains('Stan')].tail(1)).copy()\n", + "last_et_edt_stanza = pd.concat([last_et_edt_stanza, last_et_edt_stanza], ignore_index=True)\n", + 
"last_et_edt_stanza.loc[1, 'year'] = 2022\n", + "p = p + geom_line(aes(x='year', y='LAS_upper'), \n", + " data=last_en_ewt_stanza, \n", + " linetype='dashed', color='red')\n", + "p = p + geom_line(aes(x='year', y='LAS_lower'), \n", + " data=last_en_ewt_stanza, \n", + " linetype='dashed', color='red')\n", + "p = p + geom_line(aes(x='year', y='LAS_upper'), \n", + " data=last_et_edt_stanza, \n", + " linetype='dashed', color='blue')\n", + "p = p + geom_line(aes(x='year', y='LAS_lower'), \n", + " data=last_et_edt_stanza, \n", + " linetype='dashed', color='blue')\n", + "# Override default colors\n", + "p = p + scale_color_manual(values = {'et_edt': 'blue', 'en_ewt': 'red'}, guide=False)\n", + "# Add adjusted labels\n", + "p = p + geom_text(aes(x='year', y='LAS', label=\"parser\"), data=results[results['corpus']=='et_edt'], size=8, \n", + " adjust_text={'arrowprops': {'arrowstyle': '-'}} )\n", + "p = p + geom_text(aes(x='year', y='LAS', label=\"parser\"), data=results[results['corpus']=='en_ewt'], size=8, \n", + " adjust_text={'arrowprops': {'arrowstyle': '-'}} )\n", + "# Override labels\n", + "p = p + labs(size = \"corpus size (thousands of words)\")\n", + "p = p + theme(legend_position='bottom', legend_box_spacing=0.25)\n", + "display(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "44524d40", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Programmid\\Miniconda3\\envs\\py38_estnltk_neural\\lib\\site-packages\\plotnine\\ggplot.py:718: PlotnineWarning: Saving 10 x 4 in image.\n", + "C:\\Programmid\\Miniconda3\\envs\\py38_estnltk_neural\\lib\\site-packages\\plotnine\\ggplot.py:719: PlotnineWarning: Filename: figure_reported_las_vs_years.pdf\n" + ] + } + ], + "source": [ + "p.save('figure_reported_las_vs_years.pdf')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { 
+ "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/background_data/edt_punct_tokens.txt b/background_data/edt_punct_tokens.txt new file mode 100644 index 00000000..4212af0b --- /dev/null +++ b/background_data/edt_punct_tokens.txt @@ -0,0 +1,35 @@ +! +!!! +" +( +) +* ++ +, +,, +- +-- +-. +. +.. +... +.... +/ +: +; +< += +> +? +?? +????? +?????????? +????????????? +[ +] +oh-ma-olen-nii-hoolitsev-ja-seksika-hingeeluga +proto.groovetech.com/mk-demf.rxml?file=/demf/mainstage/day2/mainstageday2-kennydixonjr.rm +– +— +“ +” diff --git a/background_data/readme.md b/background_data/readme.md new file mode 100644 index 00000000..fe476b46 --- /dev/null +++ b/background_data/readme.md @@ -0,0 +1,4 @@ +# Background data used in experiments + +* `edt_punct_tokens.txt` -- all punctuation tokens of the EDT-2.6 corpus. Automatically acquired, searching for tokens with `xpos == 'Z'`; +* `visl_lemmas.txt` -- lemmas extracted from [Estonian Constraint Grammar](https://github.com/EstSyntax/EstCG) ruleset; \ No newline at end of file diff --git a/background_data/visl_lemmas.txt b/background_data/visl_lemmas.txt new file mode 100644 index 00000000..b4732596 --- /dev/null +++ b/background_data/visl_lemmas.txt @@ -0,0 +1,6508 @@ +(.*)\\.
+[0-9]++(.*) +$1 +aamen +äärde +ääres +äärest +aaria +äärista +aasi +aasta +(.*)_aasta +Aasta +aastakümme +aasta_kümme +aastane +aasta=ne +aasta_päev +aasta_sajand +ab +AB +abi +abiellu +abi_ellu +abi_elluma +abi_elu +abikaasa +abi_kaasa +abil +abista +absorbeeru +abstraheeri +abstraheeru +Academia +adapteeru +admiral +adsorbeeru +adu +ae +aed +(.*)_aed +aeg +aega +aegajalt +aeg_ajalt +aegle +aeglustu +aegu +aele +äesta +afeel +aga +äga +ägetse +ägise +aglutineeru +ah +ahasta +ahata +ahelda +aheldu +ahene +aheta +ahheta +ähise +ahista +ahki +ähki +ähkle +ahku +ähma=ne +ähmastu +ähmene +ähmi +ähmu +ahnitse +ahtru +ahtu +ähvarda +ähvardus +ahvatle +ahvi +aiata +aida +aida=nu +aidanud +aida=nud +aieta +äiga +aim +aima +aimu +aina +aine +ainult +aisti +aita +aitama +aja +ajal +aja_leht +Aja_leht +ajalukk +aja_lukk +ajatele +ajatle +aja_vahemik +ajel +ajendu +aju +akadeemia +Akadeemia +akadeemik +akommodeeru +akompaneeri +aktiivistu +aktiviseeru +aktualiseeru +akumuleeru +akvaarium +al +alaarene +ala_ealine +alahinda +alanda +alandu +alane +alarmeeri +alates +alati +alevik +Alevik +alga +algul +algult +algus +alista +alistu +alkohol +all +alla +allapoole +alla_poole +alles +allkiri +all_kiri +allpool +all_pool +allu +alluta +alpi +alt +altpoolt +alt_poolt +alumine +alune +alus +alusta +alustu +ämber +Ameerika +ameerikastu +amele +amerikaniseeru +ameti_ala=ne +ametiala=ne +ametle +ammenda +ammendu +ammu +ammu=ne +amüseeri +an +anasta +and +andeks +andesta +andma +andu +ankurdu +annihileeru +ansambel +(.*)_ansambel +Ansambel +antav +an=tu +antud +an=tud +anu +äparda +äpardu +apelleeri +apelsinid +aplodeeri +aprill +aprillikuu +aprilli_kuu +ära +arabiseeru +arastu +ärata +arbu +arbuta +arenda +arene +ärge +ärgem +argipäevastu +argistu +ärgita +ärgitus +ärgu +argumenteeri +äri +äritse +ärka +armasta +armatse +armistu +armu +armusta +ärple +ärrita +ärritu +arst +(.*)arst +arsti +artikkel +aru +aruta +arutle +arv +arva +arvamus +arvel +arvele +arvelt +arvesta +arvukalt 
+arvusta +arvuta +as +AS +asemel +asemele +asenda +asendama +asendamine +asenda=mine +asenda_mine +asendu +asene +aseta +asetse +asetu +asi +asja_koha=ne +asjastu +askelda +assimileeru +ässita +assotsieeru +aste +astu +asu +asune +asusta +asuta +asutus +(.*)_asutus +Asutus +atakeeri +atrofeeru +august +aura +auru +aurustu +auskulteeri +austa +austatud +au_tasusta +automatiseeru +ava +avakueeru +avalda +avaldu +avameeldu +avameelitse +avane +avansseeri +avarda +avardu +avasta +avastu +([a-z]++)=(.*) +([a-z]++)_(.*) +\?[a-z]++ +([a-z]++)aasta=ne(.*) +([a-z]++)=ne +([a-z]++)päeva=ne(.*) +balansseeri +balti +Balti +band +(.*)_band +Band +barbariseeru +barrikadeeri +baseeru +bassein +blaseeru +blokeeri +blokeeru +boheemitse +boikoteeri +bols^eviseeru +braveeri +bravuuritse +broneeri +bürokraadistu +bürokratiseeru +ca +comes + cond) ( +Cpr +dateeri +dateeru +de +(.*)de +debateeri +debüteeri +defileeri +defineeri +deformeeru +degenereeru +degradeeru +dehumaniseeru +Dekaad +deklameeri +deklareeri +deklasseeru +deklineeru +dekvalifitseeru +demobiliseeru +demokratiseeru +demoraliseeru +denatureeru +desarmeeru +deserteeri +deserteeru +desinfitseeri +desorganiseeru +desorienteeri +desorienteeru +destabiliseeru +detailiseeri +detsember +detsentraliseeru +detsimeetrine +devalveeru +devieeri +diafragmeeri +diagnoosi +diferentseeru +ðifreeri +diftongeeru +difundeeru +dikteeri +direktiiv +direktor +disharmoneeru +diskrimineeri +diskuteeri +dislotseeru +dispergeeru +disputeeri +dissoneeri +dissotsieeru +distantseeru +distsiplineeru +divisjon +(.*)_divisjon +Divisjon +dogmatiseeru +doktor +doktoreeru +donna +dotsent +dr +drapeeru +(.*)dud +duelleri +duelli +(.*)e +ebale +eba_määra=ne +ebardu +ebele +edasi +Edasi +edenda +edene +edesta +edu +eduta +edvista +eel +eelda +eeli +eelista +eelistus +eelmine +eelne +eemalda +eemaldu +eemale +ees +eesmärk +ees_märk +ees_pool +eest +eesti +Eesti +eesti_maa +Eestimaa +Eesti_maa +eestistu +eest_poolt +efektitse +ega +ehita +ehk +ehkki +ehmata 
+ehmu +ehti +ei +eile +einesta +eineta +eita +eks +eksalteeru +eksi +eksisteeri +eksita +ekskureeri +ekspatrieeru +eksperimenteeri +ekspluateeri +ekstraheeru +ela +elata +elatu +elavne +elberda +elektriseeru +elik +elu +elu_aeg +Elu_aeg +elu_koht +elune +elustu +elutse +ema +ema=ne +emantsipeeru +emba +embkumb +emb_kumb +emb-kumb +emigreeru +enam +enamasti +enamik +enamus +end +enim +enne +ennusta +ent +eostu +eputa +eralda +eraldu +erane +erenda +ereta +erga +ergastu +erguta +eri_ala=ne +eriala=ne +erine +erista +eristu +erita +eriti +erku +erodeeru +eruta +erutu +esemestu +esi +esildu +esile +esile_kutsutud +esimees +esi_mees +esimene +esi_naine +esinda +esine +esiteks +esitle +esitus +eskorti +esmaspäev +esmas_päev +esteeditse +Estonia +et +etapp +etendu +ette +ette_panek +ette_poole +(.*)_ette_võte +ette_võte +Ette_võte +euro=ne +euroopa +Euroopa +euroopastu +evakueeri +evolutsioneeru +fassaaditse +faulitse +femineeru +feminiseeru +feodaliseeru +fikseeri +fikseeru +filosofeeri +filosoof +filosoofitse +filtreeru +finisheeri +firma +flirti +flööti +fluktueeri +föderatsioon +(.*)_föderatsioon +Föderatsioon +födereeru +folkloor +(.*)_folkloor +folkloriseeru +formaat +formeeri +formeeru +foto +fotograaf +fraasitse +funktsioneeri +gaasistu +galopeeri +garanteeri +gastroleeri +geminaadistu +gemineeru +generaliseeru +gleistu +grammine +gratuleeri +graveeri +graviteeri +grimassita +grupeeru +grüüneta +gümnaasium +(.*)_gümnaasium +Gümnaasium +hääbu +haaki +haaku +häälda +hääldu +häälesta +häälestu +hääleta +häälitse +haara +haarama +häärata +haardu +haarle +haava +haaval +haavandu +haavu +habenda +häbene +habeta +habetu +häbi +habise +habrastu +hädalda +hädamaandu +hädatse +hägune +hägustu +haheta +hahku +haigene +haigestu +haigla +(.*)_haigla +haigla=ne +haigle +haiguta +haihtu +häilata +häiri +häiru +haise +haist +haju +hakka +hakkama +hakki +hakul +hala +halasta +halb +hälbi +halda +haleta +haletse +halise +haljastu +haljenda +halju +hallata +hallenda +halline 
+hallistu +hallita +haltuuratse +halva +halvene +hälvita +halvusta +hämardu +hämarne +hambu +hämmasta +hämmastu +hämmelda +hämmeldu +hammusta +hammustama +hämuta +hangelda +hängelda +hangu +hanki +hansa +Hansa +hapu_piim +härdu +hargne +hari +härise +harju +harjuta +harku +härmatu +härmistu +harmoneeri +harmoneeru +harpa +härra +harrasta +härratse +harune +haruta +harvene +hasseta +hästi +hatuta +haudu +hauduma +hauku +hauta +hävi +hävine +hea +heaks +heameelitse +heiastu +heikle +heilu +hein +heisku +heit +heitle +heitma +heitu +hektar +hela +heldi +heldu +helenda +helestu +helga +helgahta +helgata +helgeta +helgu +helise +helista +helju +helki +helkle +helleniseeru +hellutse +helmenda +helvenda +helvestu +helveta +hepita +hetk +hibise +higista +higistu +higitse +hiilga +hiili +Hiina +hiinastu +hiirle +hiiu_maa +Hiiumaa +Hiiu_maa +hiiva +hikita +hilbenda +hilberda +hiline +hili=ne +hilistu +hilja +hiljaks +hiljem +hilju +hilpa +hilple +hilpne +hilpsa +hilpu +himu +himurdu +himusta +himutse +hinda +hing +hinga +hingasta +hingelda +hingestu +hingi +hinguta +hinku +hinna +hinnanud +hinna=nud +hinnatud +hirm +hirmu +hirnahta +hirnata +hirska +hõbeda=ne +hõbele +hõbenda +hõbene +hõbestu +hõbetu +hoiata +hoid +hoidma +hoidu +hõigata +hõika +hõikle +hõila +hõiska +hõlbustu +hõlju +holpsa +holpsata +homme +hommik +hommikune +hommiku=ne +hommiku_poole +hõngu +hoogle +hõõgu +hoogustu +hool +hoolda +hooli +hoolimata +hoolitse +hooma +hoople +hõõrdu +hööri +höörita +höörle +hõõru +hoova +hopsa +hõrene +hõrestu +hõrgutse +hubise +hübridiseeru +hüdise +hüdratiseeru +hüdrolüüsu +hühise +huigata +huikle +huilata +huiluta +huiska +hukka +hukku +hulgaliselt +hulgas +hulgast +huligaanitse +hulju +hulk +hulka +hulku +hulla +hullu +hullutse +hülpa +hulpi +hülpi +hulple +humise +hüpe +hüple +hüppa +hurmu +hurraata +hüüata +hüübi +hüüd +huuga +hüürga +huvi +huvile +huvita +(.*)_huvita +hüvita +huvitav +huvitu +idane +(.*)ide +idee +identifitseeru +idu +idüllitse +idutse +iga +igane 
+iga_päeva=ne +iga_sugune +igati +igatse +igatsus +igaüks +iga_üks +igavik +igavle +ignoreeri +igritse +iha +ihalda +ihale +ihalus +ihastu +ihka +ihku +ihnu +ihnutse +ihtu +iiba +iili +iilitse +(.*)ik +ikaldu +ikka +ilatse +ilge +ilgelt +ilku +illumineeri +illustreeri +ilma +ilmesta +ilmlikustu +ilmne +ilmselt +ilmsiks +ilmu +ilmuta +ilmutu +ilusta +ilutse +iluuisuta +iluvõimle +ima +imbu +ime +imendu +imesta +imestu +imetele +imetle +immatrikuleeru +immigreeri +immigreeru +immitse +immuta +imponeeri +importi +ina +inaktiveeru +individuaalitse +industrialiseeru +infiltreeru +infitseeru +informeeri +inimene +inimestu +innusta +innustu +inspireeri +inspireeru +instituut +(.*)_instituut +Instituut +integreeru +intensiivista +intensiivistu +internatsionaliseeru +interpelleeri +intriigitse +invaliidistu +ioniseeru +irdu +irgitse +iriseeri +ironiseeri +iroonitse +irriteeri +irska +irvenda +irvita +(.*)is +isa +isa=ne +ise +iseäratse +iseenese +ise_enese +isegi +iseloomusta +isemeelitse +iseseisvu +iseseisvumine +ise_seisvu=mine +ise_sugune +isetolmle +isoleeri +isoleeru +isomeriseeru +issand +(.*)ist +istu +istuli +istung +Istung +istuskele +istuskle +istuta +isu +isune +isutse +itsita +ivake +[I|V|X|C|M]++(.*) +ja +(.*)ja +(.+)ja +jaa +jää +jäädvusta +jäädvustu +jaam +(.*)_jaam +Jaam +jääma +jaanuar +jäätu +jaburda +jaburdu +jädendu +jaga +jagele +jagu +jagune +jah +jahenda +jahene +jahmerda +jahtu +jahuta +jahvata +jäigastu +jäiku +jalastu +jälesta +jalg +jälg +jalga +jälgi +jälile +jälita +jäljenda +jalule +jaluta +jalutele +jalutle +jama +jämedune +jamele +jämene +jämetse +jammi +jampsi +janda +jända +jandita +jankle +jänni +janti +jantle +janune +jaoks +jaole +jaota +jaotu +järel +järelda +järeldu +järele +järelikult +järelkääri +järelkuiva +järelküpse +järelt +järelvalmi +järg +järgi +järgmine +järgne +järjest +järjesta +järjestu +järk +järsene +järv +(.*)jas +jäsele +jät +jätka +jätku +jättu +jauna +jaura +ja/või +joba +jobise +jobuta +jõgi +jõhkrutse +johtu 
+joigu +jokki +jokuta +jõla +jõletse +jõlki +jõlku +jõllita +jõllitama +jomise +jommi +jommita +jõmpsu +jõngerda +jõnki +jonkle +jõnkle +jõnksa +jonksata +jõnksata +jonksle +jõnksle +jonksu +jõnksu +jõnku +jonle +jontsi +jöntsi +jõntsi +joo +jõõbita +joobu +jooks +jooksle +jooksul +jooma +joon +joonestu +joonis +joonista +jõõrdu +joot +joovastu +joovatse +jortsata +jõu +jõud +jõudu +jõukle +jõul +jõustu +jõuta +jõutama +jt +ju +juba +jubetse +judi +judise +judista +juha +juhata +juhataja +(.*)juhataja +juhenda +juhendaja +juhindu +juhitav +juhmistu +juhmu +(.*)juht +juhti +juhtiv +juhtu +jukerda +julge +julgus +julgusta +julmutse +(.*)jumal +jumalda +jumestu +junna +jupasta +juperda +jupsi +just +justkui +just_kui +jutle +jutusta +juubelda +juuli +juulikuu +juuli_kuu +juuni +juunikuu +juuni_kuu +juurde +juurdu +juures +juurest +juuri +juustundu +ka +kaaberda +kääbustu +kääksata +kääksu +kaalu +kaamene +kaamestu +kääna +käändu +kaanetu +käänle +kaapa +kaapi +kaardu +käärdu +kääri +kaarle +käärle +kaasa +kaasajastu +kaasas +kaasle +kaasne +kaasu +kaatsa +käätsata +kaatsi +käätsu +kabestu +kabeta +kabise +kadesta +kädise +kadu +kae +kaeba +kaeble +kael +kaela +kaelas +kaelast +kaelusta +käe_pära=ne +käes +käest +kaeva +kaevandu +kaeva=nu +kaevanud +kaeva=nud +kaevu +kägardu +kägise +kaha +kahanda +kahane +kahardu +kähardu +kahekordistu +kahendik +kahendu +kahestu +kahetse +kahi +kahise +kähise +kahjatse +kahju +kahjurõõmutse +kahjustu +kahk +kahku +kahla +kahmeta +kahmi +kahtle +kahtlemata +kahtlusta +kahtu +kahuta +kahvata +kahvatu +käi +käia +käiama +käibi +käigus +kaiku +käima +kainenda +kainene +kaits +kaitse +kaitsema +kaitsma +käitu +käivitu +kaja +kajasta +kajastu +käkerdu +kakle +käkru +kaks +kaksita +käksu +kala +kalasta +kaldu +kalestu +kalgastu +kalge +kalgendu +kalgine +kalgistu +kalla +kallal +kallale +kallalt +kallerda +kallerdu +kalline +kallista +kalluta +kallutu +kalpa +kalpsa +kaltsa +kaltserda +kamakas +kamanda +kamardu +kamp +kampa +kämpu +kan 
+kanal +(.*)_kanal +Kanal +kand +(.*)kandidaat +kandideeri +kandis +kandu +kangastu +kange +kangene +kangestu +kängitse +kangu +kängu +kanguta +kannata +känni +kannul +kannule +kannult +kanoniseeru +kanti +(.*)kantsler +käntsu +kaos +kaota +kaotsi +kaperda +kapitaliseeru +kapituleeru +kappa +kapriisitse +kapsastu +kapseldu +kapsu +kapten +karamellistu +karamellu +karasta +karastu +kärata +käratse +karbata +kärbata +kärbu +karestu +kareta +karga +kärgahta +kärgata +kargle +kargu +kärgu +karguta +kari +kärise +karista +kärista +karja=ne +karjatu +karju +karjuta +kärki +kärmasta +kärmene +karmine +karmistu +karmu +kärnata +kärpi +kärsi +kärsiskle +kärsitse +kärssa +kärssu +karstu +kart +kärtsa +kärtsata +kärtsu +kartus +käruta +karv +kärva +karva=ne +karvastu +karvenda +karvusta +kas +käsi +käsil +käsile +käsita +käsitle +käsk +käski +käsnata +käsnu +kast +kastu +kasu +kasuta +käsuta +kasv +kasva +kasvata +kasvataja +kat +katapulteeru +katke +kätke +katkesta +katkestu +katki +katku +kätsata +katse +katseta +katsu +kätte +kattu +kaua +kauaks +kaubitse +kaudu +kauem +kaugel +kaugele +kaugenda +kaugene +kaugu +kaugus +kaugusel +kauguselt +kauguse=lt +käuksata +käuksu +kaunine +kaunis +kaunista +kaunitse +käunu +kaupa +kauple +kava +kavalda +kavanda +kavatse +kavatsus +kebi +kee +keegi +keel +(.*)_keel +keela +keeld +keelita +keepa +keera +keerdu +keerle +keerustu +keeruta +keerutu +keet +keevatse +keevitu +kehane +kehasta +kehastu +kehita +kehki +kehkle +kehtesta +kehtestu +kehti +kehvene +kehvu +keiser +kekerda +kekka +keksa +keksi +keksle +kekuta +kelguta +kelka +kelki +kell +kella +kemple +kena +kenastu +kenita +kenitle +kenka +kenksa +kepelda +kepsa +kepsle +kerge +kergene +kergita +keri +kerki +kes +keset +kesk +keskel +keskele +keskelt +keskenda +keskendu +keskmine +kesk_pära=ne +keskus +(.*)_keskus +Keskus +keskustele +keskustle +kest +kesta +kestel +kestenda +ketenda +ketistu +ketra +ketuta +kevad +kiba +kibene +kibestu +kibise +kibrastu +kibruta +kidele +kidise 
+kidu +kidune +kidutse +kigalda +kigatse +kigise +kiha +kihardu +kihele +kiherda +kiherdu +kihine +kihise +kihistu +kihitu +kihkle +kihku +kihlu +kiht +kihuta +kihvata +kihvatu +kiiguta +kiikle +kiiksata +kiiksu +kiiksuta +kiiku +kiila +kiilane +kiilastu +kiildu +kiilga +kiilu +kiiluma +kiimatu +kiimelda +kiimle +kiindu +kiirata +kiire +kiirenda +kiirene +kiiresti +kiirga +kiirgu +kiirita +kiiritu +kiirusta +kiirustle +kiirutse +kiiska +kiit +kiitle +kiitma +kiitsa +kiitsata +kiitsu +kiivastu +kikitu +kila +kilahta +kilatse +kilda +kildu +kilenda +kilgenda +kilise +kiljata +kiljatle +kilju +kilka +kilksata +kilksu +kilku +killas +killast +killenda +killu~ke +killune +killusta +killustu +kilo +kilogrammine +kilomeeter +kilo_meeter +kima +kimardu +kimbuta +kimele +kimp +kimple +kimpu +kindlusta +kindlustu +kindral +kindu +king +kinki +kinni +kinnista +kinnistu +kinnita +kinnitu +kiperda +kipitele +kipitle +kipitse +kippa +kippu +kipra +kipru +kira +kirata +kiratse +kirenda +kireta +kirga +kirgastu +kirgle +kirgu +kiri +(.*)_kiri +Kiri +kirik +Kirik +kirise +kirista +kirja +(.*)_kirjandus +kirjanik +kirjastus +(.*)_kirjastus +Kirjastus +kirjelda +kirjuta +kirjutama +kirka +kirma +kirmenda +kirmeta +kirmetu +kirtsuta +kiru +kirurg +kirvelda +kirvenda +kisa +kisenda +kiskle +kisku +kissi +kissitu +kitsam +kitsene +kitsine +kitsu +kitsune +kiuhka +kiuksata +kiuksu +kiunahta +kiunata +kiunu +kiunuta +kiusa +kiusle +kiuste +kiutsu +(.*)kivi +kivine +kivista +kivistu +klaari +klaaru +klaas +klaasistu +klabise +klähma +klähvata +klähvi +klammerdu +klani +klappi +klapsata +klapsu +klass +kleebi +kleebi=tu +kleebitud +kleebi=tud +kleepi +kleepu +(.*)_kliinik +kliisterda +kliisterdu +klirise +klobata +klõbata +klobise +klõbise +klõbista +klõgise +klõksata +kloksu +klõksu +klõmata +klõmmu +klompsata +klõmpsata +klompsu +klõmpsu +klompu +kloppi +klopsata +klopsi +klubi +(.*)_klubi +Klubi +klugise +kluguta +kluksata +kluksu +kluuguta +kluuksata +kluuksu +kluuta +km2 +km3 
+koaguleeru +koaleeru +koba +kobardu +kobene +kobestu +kobi +kõbi +kõbise +kõbista +(.*)_koda +kodakondsus +kodanik +kodanlikustu +kodanlustu +kodi +kodja +kodu +kõdu +kodune +kodu=ne +kõdune +kodus +kodust +kodustu +koeksisteeri +(.*)koer +koerusta +koerutse +koge +kogelda +kogemata +kogise +kõgise +kogu +(.*)_kogu +(.+)_kogu +kogune +kohahta +kohal +kohalda +kohale +kohalt +kohanda +kohandu +kohane +kohaselt +kohastu +kohata +kohe +kõhele +kohenda +kohendu +kohene +kõhetu +kohise +köhise +kõhise +kohka +kõhkle +kõhklus +kohku +kohla +kohma +köhmi +köhmitse +kõhmitse +kohmu +kõhnene +kõhnitse +kõhnu +kohra +kohru +koht +kohta +kohtle +kohtu +kohtustus +kõhulda +kõhuli +kohusta +kohustu +kohustus +kohuta +kõhuta +kohv +koiba +koiberda +kõige +koigerda +kõiguta +kõik +kõiksugu +kõiku +koipa +koit +köit +koiva +koju +kökerda +kõketa +koketeeri +kokku +koksa +koksata +kõksata +koksi +koksistu +kola +kõla +kolahta +kõlahta +kolata +kõlata +kõlba +kõlestu +kolgastu +kõlguta +koli +kolise +kõlise +kolka +kolksa +kolksahta +kolksata +kõlksata +kolksu +kõlksu +kolku +kõlku +kolla +kollabeeru +(.*)_kollektiiv +kollektsioneeri +kollenda +kollerda +kolleta +kolletu +kolmandik +kolmapäev +kolma_päev +kolmas +kolmene +kolmestu +kolm_veerand +kõlpa +kolpsa +koltu +kõma +kõmahta +kõmata +kombe_koha=ne +kombel +komberda +kombineeri +kombineeru +kõmise +komisjon +(.*)_komisjon +Komisjon +komista +komistle +komitee +(.*)_komitee +Komitee +komme +(.*)_komme +kõmmeldu +kommenteeri +kõmmu +kõmmuta +kompa +kompareeru +kompenseeru +kompi +kõmpi +komple +kompleks +komplekt +komplekteeri +komplekteeru +komplitseeri +komplitseeru +komposteeru +kõmpsa +kompsata +kõmpsi +kompsu +kõmpsu +konardu +(.*)kond +(.*)_kond +konda +kondenseeru +kõndi +köndistu +kõnele +kõneta +konfereeri +konfiskeeri +konföderatsioon +(.*)_konföderatsioon +Konföderatsioon +konformeeru +kõnge +kongerda +kongrueeru +kõngu +konjugeeru +konka +könka +konkretiseeru +konksu +könksu +konksuta +kõnku +konkureeri +konkurss 
+(.*)_konkurss +könni +kõnni +kõnniskele +kõnniskle +konserveeru +konsolideeru +konspireeri +konstateeri +konstitueeru +konsulteeri +konta +könta +kontakteeru +kontamineeru +kontraheeru +kontrasteeru +kontrolli +kontsentreeru +kontserteeri +köntsi +kontsu +konuta +könuta +konverents +Konverents +konvergeeru +konvoeeri +kooberda +kööberda +kooderda +kooga +köögerda +kõõguta +kooka +kõõksata +kõõksu +kooku +kool +(.*)_kool +Kool +kooldu +koole +koolita +kõõlu +kooluta +kõõma +kõõmenda +kõõmeta +kõõmuta +koonda +(.*)_koondis +koondu +koonerda +köönerda +koonuta +kööpa +koopereeru +koordineeri +koordu +koori +koorma +kooru +koos +kooserda +kooskõlasta +kooskõlastu +koosne +koos_olek +Koos_olek +koosta +koostu +koosuta +kootsi +köötsuta +kootu +koperda +köperda +kopikaline +kopita +kopitse +kopra +kopru +köpru +kopsa +kopsata +kõpsata +kopsi +kopsu +kõpsu +kõra +korbastu +korbata +korbatu +kõrbe +kõrbestu +kõrbu +kord +korda +kordistu +korditu +kordu +kõrge +kõrgem +kõrgene +korgistu +kõrgistu +kõrgitse +kõrgu +kõrgune +kõrgu=ne +kõrgus +korise +körise +korista +korja +korju +korki +korpu +korraga +korral +korralda +korraldu +korraldus +korras +korrastu +korreleeru +korrodeeru +korrumpeeru +korsata +korska +kõrsu +kortsu +korv +kõrval +kõrvalda +kõrvaldu +kõrvale +kõrvalt +kõrveta +korvu +kõrvuta +kõrvutu +kõsise +koska +kössi +kössita +kossuta +kössuta +kõssuta +kost +kostita +kostu +kostümeeru +kosu +koterda +kotita +kotserda +kötsita +kott +kotti +kõtuta +kouki +kõvastu +kõvene +kõverdu +kõvetu +kraakle +krääksata +kraaksu +krääksu +krääksuta +kraamelda +kraami +kraapi +kraaple +krääsle +krabise +kragise +krägise +krahv +kraksata +kraksu +krampi +krample +krampu +krapsa +krapsata +kratsi +kräuksata +kräuksu +kräunata +kräunuta +krebi +kribele +kribise +krigise +kriiksata +kriiksu +kriimusta +kriipsuta +kriisata +kriitsu +krilli +krimpsu +krimpsuta +kripelda +kristalliseeru +kritiseeri +kritselda +kriuksata +kriuksu +kriuksuta +kriunuta +krõbene +krobestu +krobise 
+krõbise +krõgise +krohv +krõksata +krõksu +krompsu +krõmpsu +krõmpsuta +kronksu +kroobata +krooksata +krooksu +krooli +kroon +kroonine +krooni=ne +krõpsata +krõpsu +krudise +krutti +kruuguta +kruuksata +kruuksu +kruvi +kübe +kubise +küde +kudu +kügele +kugise +kuhi +kuhja +kuhju +kühmita +kühmu +kühmus +kühmuta +kuhtu +kuhu +kuhugi +kühvelda +kui +kuid +kuidas +kuigi +kuika +kui ka +kui_palju +kuiva +kuivata +kuivene +kuiveta +kuivetu +kuivu +kuju +kujul +kujunda +kujundu +kujune +kujuta +kujutis +kujutle +kujutlus +kujutu +kujutus +kükerda +kukil +kukile +kukilt +kükita +kükki +kukku +kukuta +küla +(.*)_küla +küla_tee +kulda +kuletu +kulge +külge +külgne +külgu +külili +kulise +küljes +küljest +kulksu +küll +küllalt +küllasta +küllastu +kullenda +külm +külma +külmene +külmeta +külmetu +kulmineeri +kulmineeru +külmu +külmuta +kultuuristu +kultuuritse +kulu +kulul +kuluta +külva +külvu +kuma +kumb +kumbki +kümble +kumenda +kumerdu +kumeta +kumise +kumm +kummardle +kummardu +kümmekond +kummu +kummuli +kümnendik +kumuleeru +kuna +kunagi +künd +kuni +(.*)_kunst +kupla +kuplu +kuppa +küpse +küpseta +kuraasita +kuraasitse +kurameeri +kurat +kurbu +kurdistu +kurdu +kurise +kurjusta +kurjutse +kurluuta +kurna +(.*)_kurna +kurnäuta +kurnitse +kurnu +kurrutu +kurseeri +kurt +kurtu +kurvasta +kurvastu +kurvatse +kus +kusagil +kusagile +kusagilt +kuse +kusele +küsi +küsitle +kuskil +kuskilt +kust +kusta +kustama +kustkaudu +kust_kaudu +kustu +kustune +kustuta +küt +kuterda +kütkestu +kutsu +küttu +kuu +(.*)_kuu +küübestu +kuuekordistu +kuuendik +kuuga +kuuguta +kuuka +kuuksu +kuul +kuula +kuuldu +kuuletu +kuulu +kuuluta +kuum +kuuma +kuumene +kuumle +kuumu +küündi +kuune +kuu=ne +küünita +küünitu +kuupäev +kuu_päev +kuurdu +küürdu +küüri +küüru +kuurusta +küüruta +küüsi +küüsis +küüsist +küüti +küütle +küütsuta +kuvastu +kvaaku +kvalifitseeru +kviteeri +laaberda +laabu +laad +laadi +laadne +laadu +laagerdu +laamenda +lääpa +lääs +laasi +laasu +laatsata +laatu +laba=ne 
+labastu +läbe +läbi +labialiseeru +läbista +läbistu +läbi_viimine +ladene +ladestu +lädise +ladu +lae +laeku +laena +laenu +laenuta +lagedale +lagestu +lagise +lagista +lagu +lagunda +lagune +lähedal +lähedale +lähedalt +lahenda +lähenda +lahendu +lahene +lähene +lähestu +läheta +lahise +lähistel +lähistele +lähistelt +lahjene +lahju +lahkene +lahkne +lahku +laht +lahti +lahtu +lähtu +lahusta +lahustu +lahuta +lahutu +lai +läi +laia +laiali +laibastu +laiem +laienda +laiene +läigata +läigatle +läiki +läikle +laila +laineta +lainle +läinu +läinud +läi=nud +laiskle +laisku +lait +läit +läitu +laiu +laiune +laiu=ne +laius +laiuta +lakata +lakatle +lakerda +läkita +lakka +lakku +laksu +laksuta +lama +lamandu +lamaskele +lamaskle +lämbu +lamendu +lamene +lameskele +lameskle +lamestu +lammu +lammuta +lämpa +(.*)lane +lange +langeta +längu +lantsi +laokile +laostu +laota +laotu +läpastu +läpata +lapenda +laperda +läppa +lappi +läppu +laps +lapsestu +larise +lärise +lärmitse +lartsu +lärtsu +las +lask +laskma +lask=mine +lasku +lasti +lastu +lastud +las=tud +lasu +läti +Läti +lätistu +lätsa +lätsi +latsu +lätsu +lauge +laugle +laugu +laul +laupäev +lau_päev +lausa +lausu +lausumata +lausu=mata +laut +lavastu +laveeri +lävi +leba +lebaskele +lebaskle +leebi +leebu +leedi +leedu +Leedu +leegitse +leeki +leekle +leemenda +leemeta +leeri +leetu +leevendu +leevene +legaliseeru +lehistu +lehka +lehki +leht +(.*)_leht +Leht +lehti +lehvata +lehverda +lehvi +lehvita +lehvle +leid +leidu +leigene +leigu +leili +leina +lembu +lemmenda +lemmeta +lenda +lendle +lendu +lengerda +lenna +lenna=nu +lennanud +lenna=nud +lennuta +leondu +leostu +leota +lepita +leppi +lesestu +lesi +lesku +levi +levine +levita +libahta +libasta +libastu +liberaalitse +libestu +libise +libista +libistu +libitse +liblenda +lidise +lidista +lidu +ligastu +ligemale +ligi +ligidal +ligidale +ligidalt +ligikaudu +ligi_kaudu +liginda +ligine +ligistu +ligu +ligune +lihatse +lihavne +lihku +lihtsustu +lihu +lihva 
+lihvata +lihvu +liialt +liia=ne +liibu +liider +(.*)liider +liiderda +liig +liiga +liigagi +liigahta +liigata +liigendu +liigestu +liigita +liigitu +liigu +liiguta +liigutu +liik +liikle +liiku +liimenda +liimerda +liimi +liimitse +liimitusta +liimu +liisu +liit +(.*)_liit +Liit +liiter +liitrine +liitu +liiv +liiva=ne +liivastu +(.*)lik +likvideeru +lillenda +lillerda +lillu +lilluta +limpsi +lina=ne +linastu +linguta +linn +(.*)_linn +Linn +linnas +(.*)_linnas +linnastu +lipenda +liperda +lipitse +liple +lipne +lippa +lippu +lipsa +lipsu +lipustu +lirise +lirtsu +lisa +lisandu +lisane +litata +literda +litsu +litsuma +liugasta +liugle +liugu +liuska +lõbu +lõbune +lõbustu +lõbutse +lodise +lõdise +lodustu +lõdvene +lõdvestu +loend +loendu +loetle +loga +logele +logi +logise +lõgise +loha +lõhastu +lõhene +lõhesta +lõhestu +lohise +lõhise +lohista +lõhke +lõhki +lohku +lõhku +lohmerda +lõhn +lõhna +lõhustu +lohutu +lohva +lohverda +loidu +lõiga +lõiga=nu +lõiganud +lõiga=nud +lõige +loigerda +lõigustu +lõika +loima +lõimetu +loitle +loiva +loivi +lokaliseeru +lokata +lokenda +lõkenda +lokerda +lökita +lokka +lõkka +lõkku +loksa +loksata +loksi +lõksi +loksle +loksu +lõksu +loksuta +lökuta +lollistu +lömastu +lomberda +lömberda +lömita +lömpa +lõmpsata +lõmpsu +lompsuta +londerda +londita +longerda +lõngerda +longu +lõnguta +lonki +lõnksu +lõnku +lonti +lontsi +löntsi +loo +löö +looberda +lööberda +loobi +loobu +loodene +looderda +lõõgastu +loogelda +loogerda +loojene +looju +looka +lookle +looku +lööma +lõõma +lõõmahta +lõõmatele +lõõmatle +lõõmenda +loopi +lõõpi +looritu +looru +lõõsata +loosi +lõõska +looskle +loot +lõõtsata +lõõtsuta +lootus +loovuta +lõp +lopenda +loperda +lõpeta +lõpp +lõppe +loppu +lopsata +lopsu +lõpuks +lõpus +loputa +lörise +lörri +lortsata +lörtsata +lõrtsata +lortsi +lortsu +lörtsu +loruta +losku +lössita +lossuta +lössuta +losuta +lösuta +lõsuta +lotenda +loterda +lõtku +lötsu +lõtvu +lõunane +lõunasta +lõunata +lozheeri +(.*)lt +luba 
+lubjastu +luge +lugu +lugupeetud +lugu_peetud +lühem +lühene +lühistu +luhta +luhtu +luhvata +luidestu +luiguta +luiluta +luiska +luit +luita +luitu +lüka +lükka +lükki +lukku +lükku +luksata +luksu +lukustu +lulise +lülistu +lülita +lülitu +lullita +lumetu +lumistu +lumita +lümita +lumitu +lummu +lunasta +lüngu +luni +lupju +lupsa +lupsata +lupsu +lurise +lurtsata +lurtsu +lustita +lustitse +luuguta +lüüritse +luusi +luustu +m2 +m3 +ma +maa +(.*)_maa +Maa +maadle +maagistu +maa_kond +maaldu +maali +maandu +maantee +maan_tee +määr +määra +määrama +määramine +määra=mine +määra_mine +määratle +määrdi +määrdu +määri +määrus +maastu +maatustu +maa_vanem +madal +madalam +madaldu +mädane +mädastu +madise +maga +magene +magestu +mägise +magneetu +magnetiseeru +maha +mahene +mäherdune +mähki +mähku +mahla=ne +mahti +mahtu +mahutu +mai +maikuu +mai_kuu +maini +mais +maits +maitse +(.*)maja +Maja +major +majuta +majutu +maks +maksma +maksusta +malbu +malda +mälesta +mälestu +mälestus_kivi +mäleta +mall +man +Man +mana +mander +mandu +mängi +mängima +mängle +mangu +mängu +manifesteeru +manitse +mant +manu +maolda +maota +maraton +märatse +mardisandita +märgu +märka +märki +marodeeri +marrastu +marru +marssi +märts +märtsikuu +märtsi_kuu +maru +märulda +maruta +marutse +masendu +maskeeri +maskeeru +maskuleeru +mäsle +mässa +mässi +mässu +mässutse +mast +mat +(.*)mata +(.*)[=]+mata +(.*[^=])mata +matematiseeru +materda +materialiseeru +matistu +matka +matsata +matsereeru +mätsi +matsu +mattu +mauri +me +mediteeri +meel +meeldi +meeldu +meeleolutse +meele_pära=ne +meelest +meelesta +meelestu +meeliskele +meeliskle +meelita +meene +meenu +meenuta +mees +meeter +meetrine +meetri=ne +mehestu +mehhaniseeru +mehistu +mehkelda +meie +meie_sugune +meisterda +meistri_liiga +Meistri_liiga +melanhoolitse +memmestu +mendeleeru +menstrueeri +merulda +mesitse +metall +metastaseeru +metsastu +metsistu +mi +mida +midagi +migreeri +migreeru +Mihkel +mihuke +miila +miks +mil +Mil(.*) +milj 
+MILJ(.*) +millal +millimeetrine +milline +milli=ne +min +mina +mindu +mindud +min=dud +mine +(.*)[=]+mine +(.*[^=])mine +(.+)mine +minek +minem +minema +mineraliseeru +minesta +minestu +mineta +mingi +mingisugune +mingi_sugune +minister +(.*)minister +minki +minu_sugune +minut +(.*)minut +minutiline +mis +misjaoks +mis_jaoks +miski +miski P sg part +miski_sugune +mismoodi +mis_moodi +mispärast +mis_pärast +miss +missioon +Missioon +missis +missugune +mis_sugune +mistarvis +mis_tarvis +mister +mistõttu +mis_tõttu +misuke +mitmekesistu +mitmekordistu +mitmes +mitmestu +mitme_sugune +mitte +mitu +mi=tu +mitu_kümmend +mitu_sada +mitu_setu +mobiliseeri +mobiliseeru +moderniseeru +modernitse +modifitseeru +moel +mõel +mõeldud +mõelnud +mõel=nud +möhita +mõhku +möira +möirahta +möirata +möirga +mõis +(.*)_mõis +mõist +mõju +mõjul +mõjusta +mõjuta +moka +mökuta +möla +mõla +mõlema +mõlemad +mõlgistu +mõlguta +mölista +mõlki +mõlku +mölla +moluta +moment +momise +mõne_sugune +mõni +mõningane +mõningas +monopoliseeru +monteeri +mõnule +monument +mõnutse +mood +mööda +möödane +moodi +möödu +moodusta +moodustu +möögi +mööki +möön +mõõna +moonda +moondu +mõõt +möra +moraalitse +moraliseeri +mõrane +mõrastu +mörata +mõrka +mornistu +mornitse +mõrune +mõrustu +mõte +mõtel +mõtel=nu +mõtelnud +mõtel=nud +mõtesta +mõtestu +mõtiskle +motiveeri +mõtle +mõtlema +motoriseeru +möura +mõura +mr +ms +msc +mtü +MTÜ +mudastu +müdise +müdista +müdra +mugandu +mugane +mügardu +mugatse +mugavdu +mugavle +mugavne +mügerdu +mügi +müha +muhele +muhene +mühise +mühki +muhku +muiahta +muiata +muidu +muidugi +muiga +muigle +muiska +muiskle +muist +mükerda +mukki +müksa +müksle +mula +muldu +mulgustu +mulise +mulju +mulksata +mulksu +mülla +mulla=ne +mullastu +mulle +mullita +mullitse +mültu +mumifitseeru +mumise +munserda +munsterda +müra +mürahta +murasta +murd +murdma +murdu +murele +murene +mürgelda +mürgi=ne +mürgistu +mürgitu +mürise +mürra +mürtsu +mürtu +murustu +müsele +muserdu +musitseeri 
+must +mustenda +mustene +mustu +muteeru +mütsu +muu +müü +müübi +muugi +müügi +muula=ne +muuma +muumiastu +muundu +müüra +müürga +müüri +muut +muutu +naabrutse +naakle +nääkle +nääksata +naaldu +näändi +naa_sugune +nabi +nabise +näbrasta +nädal +(.*)nädal +nädala=ne +nädala_päev +naeldu +naelu +naeluta +naelutu +naer +naerata +naeratele +naeratle +naerle +naeruvääristu +näeta +näge +nagele +nägele +nägema +nägi +nagise +nägise +nagu +nägu +nägurda +nahastu +nahise +nahka +nahku +nähtu +nähva +näi +näida +näida=nu +näidanud +näida=nud +näide +naiivitse +näikse +naine +näinu +näinud +näi_nud +naisestu +nait +näita +näiteks +naitu +näitu +najal +najale +najalt +najatu +nakata +nakatu +nakka +näkka +nakku +naksu +nali +nälja=ne +naljata +naljatele +naljatle +näol +näpista +nappi +napsita +napsuta +näpu_vahele +närbi +närbu +näri +narise +närise +narmastu +narmenda +narrita +närtsi +närtsu +närune +närustu +närvastu +närveeri +närvelda +närvetu +närvitse +närvle +nasaleeru +näsele +nasise +nätsata +natsionaliseeri +nätsku +natsu +nätsu +natti +natuke +naturaliseeru +näugata +näuksata +näuksu +nauti +(.*)ne +neela +neelata +neeldu +neeti +neid +neiu +neiuke +neiu=ke +neljakordistu +neljandik +neljapäev +nelja_päev +nemad +nende_sugune +nenti +nesti +neutraliseeru +nigise +nihele +nihestu +nihku +nihuke +nihukene +nihuta +nihutu +nihvata +nihvelda +nihverda +nii +niiksata +niiksu +niipea +nii_pea +niisama +nii_sama +niisamasugune +nii_sama_sugune +niisama_sugune +niisamuti +nii_samuti +niisku +niisugune +nii_sugune +niisuke +niit +niitsata +niitsu +niivõrd +nii_võrd +nikastu +niksu +nimel +nime=line +nimeta +nimetus +nimi +nina=ne +ning +nipita +nireta +nirgu +nirise +nirtsuta +nisuke +nisukene +nitrifitseeru +niugu +niuksu +niutsu +nivelleeru +niverdu +(.*)nna +nõdrene +nõela +nõetu +nõgu +nõgune +noh +nohata +nohhita +nohise +nohista +nohuta +nõidu +nõjal +nõjatu +nokasta +nokastu +nõksata +nõksatu +nõksle +nõksu +nolguta +noluta +nomadiseeri +nõnda +nooguta +nooku 
+nooldu +noomi +noor +noordu +nöördu +noorene +nööri +noorutse +noosku +noppi +nördi +nõrenda +nõreta +nõrgene +nõrgestu +norgu +nõrgu +norguta +nõrise +nõristu +nõrke +nõrku +normaliseeru +norra +Norra +norska +nortsu +nõruta +nosise +nõtkestu +nõtku +nõtru +notti +nõu +nõud +nõue +nõukogu +nõu_kogu +nõus +nõuskle +nõus_olek +nõustu +nõuta +nõutle +november +nr +(.*)nt +(.*)[=]+[ntd]u +([^=]*)[ntd]u +(.*)[=]+[ntd]ud +([^=]*)[ntd]ud +nüansseeru +(.*)nud +nudistu +nugi +nuhata +nühele +nuhise +nuhki +nühki +nühku +nuhtle +nukku +nukru +nukrustle +nukrustu +nukrutse +null +number +nuple +nuppu +nuputa +nura +nurata +nürine +nurise +nüristu +nurja +nurju +nurkle +nurku +nurru +nuru +nut +nüüd +nuuksata +nuuksu +nuumu +nuusuta +nuutsa +nuutsata +nuutsu +objektiveeru +oblitereeru +observatoorium +Observatoorium +odavne +(.*)_õde +öel +öel=nu +öelnud +öel=nud +õelutse +ogardu +õgi +õgvene +ohata +ohatu +õhene +õheta +ohheta +õhk +ohka +õhka +ohkle +õhku +ohoota +ohtralt +ohtu +õhtu +õhtune +õhtu=ne +õhtu_poolik +õhuta +ohverda +ohverdu +oiata +õidu +õienda +oiga +õige +õigene +oigle +õigus +õigusta +õigustu +õilmitse +oimetu +oimitse +öine +öi=ne +oistu +õitse +oiuta +oja +okastu +okludeeru +oksenda +oksüdeeru +oktoober +ol +ole +olek +olele +olema +olene +oleskle +oleta +oletse +olgugi +olnu +olnud +ol=nud +õlul +õlule +oluliselt +õlult +oma +oma_jagu +omaks +oma_moodi +omanda +omane +oma_pära=ne +omapära=ne +omapäratse +oma_sugune +omavolitse +õmble +omista +õnge +õngitse +õnne +õnneks +õnnesta +õnnestu +õnnista +õnnitle +öö +ööbi +öögata +ööki +õõndu +õõnesta +ööstu +oota +õõtsata +õõtsu +õõtsuta +õpeta +õpetaja +õppi +opteeru +orastu +organisatsioon +(.*)_organisatsioon +Organisatsioon +organiseeri +organiseeru +orienteeri +orienteeru +originaalitse +orkester +(.*)_orkester +Orkester +õrnle +õrnu +õrnutse +õrrita +õrritu +osa +osakond +(.*)_osakond +Osakond +osale +osaleja +osale=mine +osalus +(.*)_osalus +Osalus +osane +osapool +osa_pool +osa_riik +osas +oska +oskus 
+osoneeru +ost +osuta +osutu +ots +otsa +otsama +otsas +otsast +otse +otsekui +otse_kui +otsi +otsus +otsusta +otsustu +oü +OÜ +õunad on subjekt +paadu +paagata +paagatu +paaku +paanitse +paar +(.*)_paar +paardu +paari +pääri +paarima +paaritu +paaru +pääse +pääsema +pääst +paastu +paatsi +paatu +pabista +padise +pae=ti +päev +(.*)_päev +päevane +päeva=ne +päevita +page +pagenda +pagise +pahanda +pahandu +pahane +paha=ne +pahatse +pähe +pahene +pähhita +pahise +pahmi +pahtu +pahuksisse +pahurda +pahutse +pahva +pahvata +paigalda +paigaldu +paigale +paigustu +paiguta +paigutu +paik +päike +Päike +paikne +paiku +paina +paindu +painuta +paisa +paisanud +paisa=nud +paisatud +paiska +paisku +paist +paisteta +paistetu +paisu +paisuta +paita +päitu +pajata +pakahta +pakata +pakita +pakitse +pakk +pakki +pakku +pakkuma +paksene +paksu +paksune +paksu=ne +paksustu +paku +pakundu +pakuta +pakutama +paku=tu +pakutud +paku=tud +pala +palavne +palistu +paljane +paljasta +paljastu +palju +paljud +paljune +palka +palmi +palmu +palu +paluma +palve +palveta +palvle +pan +pänderda +panditu +pane +panema +panetu +pank +panku +pann +pannalda +panni_täis +pannu +pannud +pan=nud +panti +pantsata +paoskle +paotu +par +par. 
+pära +paragrahv +parajasti +pärale +päralt +paranda +päranda +pärandu +parane +parasiteeri +pärast +parem +paremale +parene +päri +parim +pärine +päris +parise +päriselt +parista +pärit_olu +park +(.*)_park +Park +parki +parku +parlament +(.*)_parlament +Parlament +pärlenda +pärmi +parodeeri +parselda +pärssu +partei +(.*)_partei +Partei +parteistu +partsata +parvle +pasanda +pässandu +passi +passima +pasunda +patakas +paterda +päterda +pätistu +patra +päts +patsa +patsata +patseeri +patserda +pätserda +patsi +pätsi +patsu +patsuta +patusta +paugata +pauki +pauku +pea +peaaegu +pea_aegu +peal +peale +peale_poole +pealistu +pealkirjasta +pealne +peal_pool +pealt +pealt_poolt +peamine +peamiselt +peapiiskop +pea_piiskop +peas +peatu +peatükk +pea_tükk +pebrestu +peegelda +peegeldu +peendu +peenene +peenestu +peenutse +peeta +peetama +peetu +pehastu +pehki +pehmene +pehmu +peibuta +peit +peitu +peks +peksi +peksle +peksu +peleta +pelga +pelgu +peluta +pendi +pendu +pensioneeru +peo_täis +periheel +periood +(.*)_periood +perrooni_tolm +personifitseeri +peruta +pese +pesitse +pesitu +pesu_väel +pet +petiskele +petiskle +pettu +phd +pida +pidama +pidi +pidurda +pidurdu +pidusta +pidutse +pigem +pigista +pigistu +pihk +pihka +pihku +pihta +pihti +pihustu +piibita +piibuta +piiga +piigasta +piiksa +piiksata +piiksu +piil +piilu +piiluta +piim +piimastu +piinle +piinu +piira +piirdu +piires +piiristu +piirne +piisa +piisavalt +piiskop +piitsuta +pikali +pikem +pikene +piki +pikk +pikku +pikkune +pikku=ne +pikkus +piku=ne +pikuta +pilbastu +pildistu +pildu +pilet +pilguta +pilka +pilku +pilla +pilli +pilluta +pilune +pilvi=ne +pilvita +pimendu +pimene +pimestu +pindu +pinevne +pingesta +pingestu +pinguldu +pingustu +pinguta +pingutama +pinise +pinni +piparda +pipardu +pipra=ne +piraaditse +pirise +pirtsuta +piselrda +pisene +piserda +pissi +pissu +pist +pistma +pisut +pitsitu +pitsu +piuksata +piuksu +plaan +plaani +plaanitse +plaatsata +pladise +plädise +plagise +plahvata 
+plaksata +plaksu +planeeri +planeet +plara +plarise +plärise +plartsa +plartsata +plartsu +plärtsu +platra +platsu +plätsuta +pleeki +plinki +plirise +plõgisel +plõksata +plõksu +plõnksu +plötsi +plumpsa +plumpsata +plumpsu +põde +podise +poeg +poegi +poeta +poetu +põgene +pogise +põhi +põhine +põhjal +põhjenda +põhjendu +põhjene +põhjus +pohmitse +poi +põika +põikle +põiku +põimi +põimle +põimu +poiss +poiss_laps +põkku +poksle +põksu +põldu +põle +polemiseeri +põleta +põletikustu +põlga +polikliinik +põlistu +politiseeri +polk +polümeeru +polümeriseeru +põlv +põlvili +põlvita +põlvne +pomise +põmise +pommi +pommita +põmmuta +põndu +põnevustu +põnku +põnni +ponnistu +ponsu +pontsu +põntsu +põntsuta +poo +pooki +pool +poolastu +pooldu +poole +pooleli +poolest +poolitu +poolne +poolt +pool_teist +põõna +põõnuta +pööra +pööra=ne +pöörastu +pööratse +pöördu +pöörita +pöörle +pooseta +popsu +popsuta +põrahta +porise +põrise +põrka +põrki +põrku +põrmne +põrmustu +porsu +põru +põruta +posi +poti_täis +potsata +potsu +pott +põuastu +praadi +praaki +prääksata +prääksu +praali +praavi +praga +pragise +pragune +prahvata +praksu +prantsata +prantsuspärastu +prantsuspäratse +prantsustu +prassi +praugata +(.*)_prefektuur +preili +premeeri +presideeri +president +pressi +pressima +pretendeeri +prevaleeri +prigise +priskene +pritsi +pritsu +probleemitse +professor +programm +progresseeru +projekt +Projekt +prõksata +prööka +prooska +proov +proovi +prostitueeri +protesteeri +protsent +protsessi +proua +provints +provotseeri +pruuki +pruunistu +pruusata +pruuska +pubise +pudel +püdeldu +pudene +puder +pudise +pudune +püga +pügaldu +puge +(.*)püha +pühapäev +püha_päev +puhasta +pühenda +pühendu +püherda +pühitse +puhk +puhka +puhke +pühki +puhkle +puhtu +puhu +puhul +puhvi +puies_tee +puika +puikle +puista +puistu +puitu +pujahta +pukerda +püks +pukseeri +puksi +puksle +puksu +pulbitse +pulku +pullistu +pulmita +pulmitse +pulseeri +pulstu +pummelda +pumpa +pumpsata +punasta 
+punastu +pundu +punenda +puneta +püngi +pungitu +pungle +pungu +punkt +punkti +punktima +punni +punnita +punnitu +punsu +punt +punu +punundu +puperda +pupsu +purdu +pure +purele +pureta +pürgi +purgu +purise +purista +purjeta +purju +purjuta +purska +pursu +puruks +purune +purusta +purustu +pusi +püsi +püsistu +puski +pusserda +pussita +pussuta +püsti +püstita +püstitu +puterda +putita +putka +putkastu +putkestu +putku +putra +püüd +puuderdu +puudu +puuduta +püüe +puuki +puurdu +puuri +puurle +puustu +puutu +puutu=v +(.*)_raadio +räägata +raagu +rääki +rääkima +raamastu +raames +raami +raamistu +raamu +raasi +räästu +raasu +rääsu +raasuke +raatsi +raba +räbaldu +rabastu +rabele +rabendu +rabene +rabestu +rabise +rada +radise +rädise +raevle +raevu +raevutse +rägele +ragene +ragise +rägise +raha +rahesta +rähkle +rahku +rähmastu +rahmelda +rähmelda +rahmitse +rähmu +rahul +rahuldu +rahule +rahune +rahusta +rahustu +rahvas +(.*)_rahvas +Rahvas +rahvasta +rahvastu +rahvus +(.*)_rahvus +Rahvus +rahvusta +rahvustu +räidi +räihta +raipne +raipu +raiska +raisku +raiu +raiuma +rajandu +rajane +rajatu +rajutu +rakata +rakenda +rakendu +rakindu +rakku +raksata +räksata +raksu +räksu +ralli +rämbi +rambu +rammestu +rammi +rammustu +rampu +rända +randu +ränistu +ränna +räntsata +räpa=ne +räpastu +raple +rappu +rapsi +rapsu +raputa +rasestu +raske +raskendu +raskene +rasku +raspelda +rässi +rasvastu +rasvne +rasvu +raugastu +rauge +raugene +raugestu +rausata +ravi +ravitse +reageeri +realiseeri +realiseeru +reasta +reastu +rebene +rebesta +rebestu +rebi +rebima +reede +reegli_pära=ne +reeglipärastu +reegli_põhi=ne +reen +reet +reevakueeru +refereeri +reflekteeru +regenereeru +registreeri +registreeru +reguleeru +rehabiliteeru +rehkenda +reibastu +reis +reisi +reivle +reklaami +rekordita +rekorditse +rekvireeri +relvasta +relvastu +remilitariseeru +remonti +renteeri +renti +reorganiseeri +reostu +repatrieeru +repeta +repetu +repne +resideeri +resoneeru +resorbeeru +resulteeri 
+resulteeru +resümeeri +retsidiveeru +revolutsioneeru +ribele +ribise +rida +ridane +rigise +riidle +riieta +riietu +riigi_kogu +riigistu +riik +(.*)_riik +Riik +riimu +riisu +riiva +rikastu +rikestu +rikki +rikku +rikne +rikundu +ring +ringi +ringis +ringita +ringle +ringuta +rinnulda +rinnuta +ripakile +ripenda +riple +ripne +rippu +riputa +risk +riskeeri +riski +rista +risti +ristle +ristu +risustu +riukle +riuksu +rivaalitse +rivista +rivistu +robise +rögi +rõgise +roheldu +rohelistu +rohenda +roheta +röhitse +rohkem +rohkene +rohkesti +röhki +röhmitse +rohtu +rõhu +rõhuta +roida +roidu +roimle +roina +roisku +rõivasta +rõivastu +rokeeri +rõkka +rõksle +romaani +romahta +romaniseeru +rõmpsu +rondu +rõngastu +roni +roniskle +rõnssa +rooja=ne +roojastu +rooki +rööki +rõõm +rooma +roomaskele +roomaskle +roomastu +roomle +rõõmle +rõõmusta +rõõmustama +rõõmustele +rõõmustle +rõõmustu +rõõmutse +roosata +roosatu +rooska +roosteta +roostetu +roostu +rootsi +Rootsi +rootsistu +rootsu +röötsuta +röövi +ropenda +ropsata +roputse +rübele +rübi +rubise +rublane +rubla=ne +rudise +rühki +rühkle +rühm +rühma +rühmelda +rühmita +rühmitu +rühmu +ruiga +ruigata +ruigle +ruineeru +ruleeri +rulla +rulli +rullu +rumaldu +ründa +rüntsata +rüsele +ruska +rusku +russifitseeru +rusu +rusune +rusustu +rutle +rutta +ruttu +ruuga +rüüpa +rüüpi +ruupsi +rüüsta +rüütu +sa +saa +saabu +saad +saadik +saagi +saagitse +saali +saama +säärane +saardu +saare_maa +Saaremaa +Saare_maa +sääst +saastu +säästu +saat +saatel +saati +saavuta +sabi +sabise +sabitse +sada +sadakond +sädele +sadestu +sadine +sädista +sadu +sadulda +sageli +sagene +sagi +säherdune +sahise +sahkerda +sähvata +sähvi +säili +sajand +sakerda +sakri +sakru +Saks +saksa +Saksa +saksaskle +saksastu +salakuula +sala_pära=ne +salapära=ne +salene +salga +salku +sälku +salli +saluteeri +sama +samalaadne +sama_laadne +samane +sama_palju +samas +samasugune +sama_sugune +samle +samm +sammu +sämpu +samuti +sanktsioneeri +saple +säpru 
+sapsata +säpsata +sära +särdu +särele +särise +sarja +sarnane +sarnle +särtsata +särtsu +säru +sasele +sasise +säsise +sassi +sassu +sätenda +sätti +sattu +säuksu +säutsu +(.*)se +sead +seadus +(.*)seadus +seal +sealpool +seal_pool +sealt +sealtpoolt +sealt_poolt +seas +seast +seastu +§ +seda +see +seebita +seedi +seedle +seedu +seega +seemendu +seepärast +see_pärast +seepu +sees +seesama +see_sama +seesinane +see_sinane +sees_pool +seest +seest_poolt +seesugune +see_sugune +seetõttu +see_tõttu +sega +segamini +segele +segi +segundu +segune +segustu +seigu +seikle +seira +seis +seisata +seisne +seisund +seisuta +sekelda +sekka +sekretär +sekund +(.*)_sekund +sekundiline +sel +selekteeri +seleta +seletu +selg +selga +selgendu +selgita +selgne +selgu +selile +selili +selitu +seljas +seljataha +sellal +selleks +sellepärast +selle_pärast +selletaoline +selle_taoline +selletõttu +selle_tõttu +selline +selli=ne +selmet +selti +selts +(.*)_selts +Selts +seltsi +seltsimees +seltsi_mees +seltsis +seltsist +seltu +selvi +sembu +semitiseeru +semutse +senat +Senat +sendiline +seni +senikaua +seni_kaua +sensei +sentimeetrine +seo +seos +seoses +seosta +seostu +seotu +seotud +seo=tud +separeeru +september +serva +servama +serveeri +servistu +sest +sestpeale +sest_peale +sestsaadik +sest_saadik +setti +setustu +shokeeri +siba +sibele +sibi +sibli +sidi +sidu +sidundu +šifreeri +sigarett +sigatse +sigi +sihise +sihka +sihti +sihuke +siia +siiapoole +siia_poole +siiberda +siidi=ne +siin +siinpool +siin_pool +siirda +siirdu +siirle +siis +siiski +siit +siitpoolt +siit_poolt +siitsealt +siit_sealt +sikuta +sila +silbu +sildu +sileeru +silene +silise +silita +silka +silksa +silksata +silksu +silku +sillerda +silm +silma +silmi +silmitsi +siltsu +silu +sina +sinenda +sinerda +singu +sinine +sinna +sinnapoole +sinna_poole +sinu_sugune +siple +sirenda +sireta +sirgestu +sirgu +sirise +sirka +sirtsu +siru +siruli +siruta +sirutele +sirutle +sirutu +sisalda +sisaldu +sisenda +sisesta 
+sisise +sisista +siss +sisse +sisse_poole +sisune +sitkene +sitkestu +sitku +sitsi +sittu +siugata +siuke +siukene +siuksu +siuna +siutsu +skeem +s^likerda +snoobitse +söanda +sobele +sobi +sobise +sobitu +sõbrusta +sõbrutse +sõdi +sõelu +soeta +sogane +soga=ne +sogastu +sõgene +sohi +sohise +sohki +sõide +sõiduta +soigu +soiku +sõima +sõimle +soini +sõit +sõitle +sokk +sokuta +sola +solgata +solise +solka +solkne +solksata +solku +sõlmi +sõlmu +sõltu +solva +solvu +sõma +sõmerdu +sompu +sõna +sõnasta +sondu +sõnele +songle +söö +soodusta +soojenda +soojendu +soojene +sool +soola +soola=ne +sooldu +soome +Soome +soomitse +sõõna +soondu +soonistu +soopa +sõõrata +sõõrdu +soorita +sööst +soostu +sööt +sööti +soov +soovi +soovita +soovitus +soperda +sopista +sopra +sora +sordu +sorgelda +sori +sörki +sort +sorteeri +sorti +sortsu +sõtku +sotsialiseeru +sõud +spetsialiseeru +staar +staatus +stagneeru +starti +steppi +stimuleeri +strateegia +sublimeeru +sudi +süga +sügav +sügavune +sügavus +suge +sügele +sugene +sugi +sügis +sugu +suhkur +suhtes +suhtle +suhtu +suhtu=v +suigata +suiguta +suistu +suitse +suitseta +suju +sukeldu +sukk +sula +sulandu +sulane +sulata +sule +sulestu +sule=tu +suletud +sule=tud +sülga +sulge +sulgi +sulgne +sulgu +suli +sulise +sülita +sulju +sulpsata +sulpsi +sulpsle +suma +sumata +sümboliseeri +sumbu +sumise +summeeru +summerda +summi +summu +sumpa +sümpoosion +Sümpoosion +sümpoosium +Sümpoosium +sundi +sündi +süngestu +sunni +sunnil +sünnita +sünteesi +superda +suple +supp +sure +sureta +surise +surm +surtsata +suru +survu +susise +suspendeeru +suss +süst +süsti +süstima +süstma +sütita +sütti +süü +suubu +süüdi +süüdista +süüdistu +suudle +suuna +suunas +suunast +suunatud +suund +suunda +suundu +suunduv +suur +suurdu +suured apelsinid, väikesed õunad +suurenda +suurene +suursaatkond +suur_saatkond +suurune +suurus +suuruselt +suurusta +suurustele +suurustle +suut +süüta +süüvi +suvatse +süvene +suvi +suvita +ta +taagelda +taandu +taani 
+Taani +taaru +taaruta +taas +taba +tabel +taeva=ne +taga +tagane +tagant +tagant_poolt +taga_pool +taga_poole +tagasi +tagasta +tagu +tagune +tagurda +taha +tahapoole +taha_poole +tahe +tähelda +tähele +tähelepanu +tähele_panu +tähenda +tahene +tahes +tähista +tahku +taht +tähti +tähtima +tahtmine +tähtsusta +tahu +täi +täidu +täiene +täiesti +taipa +täis +täit +täita +täitu +täiustu +taju +takerdu +takista +takistus +takitse +takk +takka +täksi +talitse +tall +talla +talleta +talletu +taltsu +taltu +taluta +talv +talvita +talvitu +tammu +täna +tänama +tänav +tänita +tantsi +tantsiskle +tantsle +tänu +taoline +taotle +taotse +tap +taple +tapsi +täpsusta +täpsustu +(.*)tar +tära +tardu +targuta +targutele +targutle +targutse +tärise +tärista +tärka +tärlenda +tarretu +tarvis +tarvita +tarvitse +tasa +tasakaalusta +tasand +tase +tassi +tasu +tatsa +tätsa +tatsu +tatsuta +(.*)[=]+tav +(.*[^=])tav +tava +tavatse +te +tead +teade +teata +teater +(.*)_teater +Teater +teatud +tea=tud +tedele +tee +tee_kond +teel +teeni +teenitse +teeskle +tege +tegele +tegemine +tege=mine +tegi +tegu +tegutse +tehnikum +(.*)_tehnikum +Tehnikum +tehtle +tei +teie +teie_sugune +teine +teineteise +teine_teise +teinu +teinud +tei=nud +teiseks +teisendu +teisene +teisipäev +teisi_päev +teis_pool +teis_poole +teis_poolt +teis_sugune +teist_moodi +teist_sugune +teisu +tekita +tekitatud +tekitav +tekki +telefon +telegrafeeri +telki +tellerda +telli +tema +tema_sugune +tembelda +tembuta +temmelda +tempereeru +tempi +tendeeri +tendi +teosta +teostu +teotse +tera=ne +teravmeelitse +teravne +tere +tereta +teritu +terve +tervene +tervistu +tervita +tibi +tihene +tihka +tihku +tihti +tihuta +tiiksu +tiim +Tiim +tiirle +tiitel +tiivle +tikku +tiksata +tiksu +tilguta +tilise +tilka +tilku +tiltsata +tiltsu +timmi +timpi +tingi +tingitud +tingle +tinise +tippa +tipsuta +tiri +tirtsata +tituleeri +tiuksata +tiuksu +tiuksuta +tõbi=ne +tõde +toel +tõenäosus +tõenda +tõesti +tõestu +toeta +toetu +töga +togi 
+tohleta +tohti +tõhu +tohutult +toibu +toime +toimeta +toimi +toimu +toimumine +toimuv +töina +toit +toitu +tõke +tokerda +tokerdu +tõkesta +toksa +tolgenda +tolkne +töllerda +tölluta +tolm +tolma +tolmu +tolmu=ne +tolmuta +tõmba +tömbistu +tõmble +tõmbu +tõmma +tõmma=tu +tõmmatud +tõmma=tud +tõmmuta +tompu +tõngu +tönka +tonksa +tonn +tonnine +too +töö +(.*)_töö_andja +töö_andja +Töö_andja +toorene +toorestu +toorutse +toosama +too_sama +tõota +tööta +tõotus +toppa +toppi +toppu +(.*)tor +tore +tõrele +toretse +torga +torganud +torga=nud +tõrgasta +torgatud +torise +törise +tõrju +torka +torki +tõrku +torma +törruta +tortsata +tõsi +tosin +tõsine +tõsi=ne +toss +tossa +tossu +tõst +tõste +tõstu +tõtle +totru +totsu +tõtta +tõttu +tõu +tõug +tõuga +tõuga=nu +tõuganud +tõuga=nud +tõuka +tõukel +tõukle +tõuku +tõus +traagelda +traali +traavi +traditsioon +(.*)_traditsioon +trahvi +trallerda +tralli +trampi +trample +transformeeru +treener +treeni +trehva +trei +triivi +trimpa +triumfeeri +trobi_kond +trombu +tröödelda +trooni +trügi +trükindus +trükki +trummelda +trumpa +tsentuurio +tseremoonitse +tsirkuleeri +tsiteeri +(.*)tud +tudi +tüdi +tüdine +tüdruk +tuge +tugema +tugev +tugevne +tugine +tuh +tuhat +tuhatkond +tuhise +tühistu +tühjenda +tühjene +tühju +tuhk +tuhmu +tuhni +tuika +tuikle +tuiku +tuimene +tuimu +tuisa +tuisa=nu +tuisanud +tuisa=nud +tuiska +tujutse +tükk +tükki +tükkis +tukku +tuksata +tuksi +tuksu +tul +tule +tulek +tulema +tulene +tulenevalt +tuleta +tuletu +tuli +tulista +tülitse +tülli +tulnu +tulnud +tul=nud +tulvil +tumene +tümise +tummu +tümpsu +tund +(.*)_tund +tundma +tundu +tungi +tunnine +tunnista +tuntud +tupru +turbu +turdu +turgata +turguta +turni +turri +tursu +turtsu +turvu +tusa=ne +tüsene +tuska +tuskle +tusti +tütar +tütar_laps +tuttav +tuttu +tutvu +tutvusta +tutvustus +tuul +tuula +tuuldu +tuuluta +tuumenda +tüüp +tuupi +tüüri +tuuselda +tuusti +tüüta +udutse +üha +ühe +ühele_poole +ühel_pool +ühelt_poolt +ühenda +ühend_riik 
+ühendu +ühendus +(.*)_ühendus +Ühendus +ühes +ühine +ühing +(.*)_ühing +Ühing +uhise +uhka +uhkelda +uhkle +uhkusta +ühma +ühmata +uht +ühte +ühti +ühtlusta +uhtu +ühtu +uhundu +uima=ne +uinu +uiskle +uisuta +uita +uitle +uitseta +uju +ujula +ujuta +ukerda +ukraina +Ukraina +üks +üksi +ükski +üksnes +üksteise +üks_teise +ülal +ülal_pool +ülalt_poolt +ulata +ulatama +ulatse +ulatu +ulatus +üldi +üldista +üle +üle_eelmine +üle_eile +üle_homme +üle_järgmine +ülemine +ülenda +ülene +üles +ülesalla +üles_alla +ülespoole +üles_poole +ülesse +ületa +ülevaade +üleval +ülevalpool +üleval_pool +ülevalt +ülevaltpoolt +ülevalt_poolt +ulgu +üli_kool +Üli_kool +ülista +uljutse +üllata +üllatu +ulmi +ulu +ümahta +ümardu +ümata +umbe +ümber +ümbert +umbes +umb_kaudu +ümbritse +umbrohtu +umbusalda +umise +ümise +ümista +umme +ümmistu +ummuksisse +unarule +unda +unele +uni=ne +Unioon +unista +unistele +unistus +untsu +unusta +upita +uppi +uppu +üpris +uputa +ura +ürahta +urata +urba +urbne +urgitse +ürita +üritus +usalda +usk +usku +üsna +ussita +usutle +ütle +ütlema +utsita +uubu +uudista +uudu +uuendu +uuene +uurdu +üürga +uuri +üüri +uurista +uuta +(.*)v +(.*)[=]+v +va +v.a +vaada +vaada=nu +vaadanud +vaada=nud +vaagi +vääksata +vääksu +vaaku +vääna +väändu +väärata +väärdu +vääri +väärt +väärtu +väärtustu +vaaru +vääruta +vaata +vaatama +vaatamata +vaatle +vabanda +vabandu +vabane +vaba_riik +vabasta +vabastama +vabastamine +vabasta=mine +vabasta_mine +vabastu +vabise +vadele +vadi +vadista +väel +väele +vaesu +väeta +vaeva=ne +vaevle +vaevu +väga +vagatse +vägev +vagluta +vaglutu +vähe +vahel +vaheldu +vahele +vaheline +vahelt +vähem +vähemalt +vahemik +vähenda +vahene +vähene +vahest +vaheta +vahetu +vähkre +vahti +vahtima +vahtu +vahuta +vahutse +vahvu +vaibu +vaid +vaidle +vaigata +vaigistu +väike +vaiki +vaimustu +vait +väit +väita +väitle +vaja +vajadus +vajaka +vaju +vajuta +vakata +vakka +vaktsineeri +vala +vald +(.*)_vald +Vald +valda +valenda +valeta +valge +valgene 
+valgevene +valge_vene +Valgevene +Valge_vene +valgu +valgus +valgusta +valgustu +välguta +vali +välista +valitse +valitsus +Valitsus +välja +välja_poole +väljas_pool +väljasta +väljast_poolt +välja_vaade +väljenda +väljendu +valjene +väljene +välju +välkle +välku +valla +vallanda +vallandu +vallatle +valluta +valmi +valmidus +valmis +valmis_olek +valmista +valmistu +valssi +välta +vältel +välti +valu +välu +valuta +valva +vana +vana_ema +vana_isa +vanane +vanderda +vandu +vanem +vanguta +vanku +vanne +vannuta +vantsi +väntsu +vanu +vanune +vaple +vappu +vara +värahta +varal +varale +vara=ne +värba +värele +varem +vari +varise +varja +varju +varjus +varjust +varjuta +varjutu +varmasti +värskendu +värskene +varssu +varsti +vartu +varu +varul +varustu +värv +värvi +värvistu +värvu +vasakule +vasemale +väsi +(.*)_väsi +väsita +vassi +vassu +vasta +vastamisi +vastanda +vastane +vasta=ne +vastas +vastassugu +vastas_sugu +vastast +vastavalt +vastu +vaterda +vatra +veda +vedama +vedelik +veebruar +veel +veelgi +veen +veendu +veeni +veerand +veerde +veere +veeres +veerest +veereta +veeri +veet +veetle +vehki +vehkle +veiderda +veidi +veidike +veikle +velple +vene +Vene +venemaa +vene_maa +Venemaa +Vene_maa +venestu +veni +venita +vereta +veritse +vesi +vesista +vesitse +vest +vestle +vetru +vetti +vibale +vibele +vibise +vibreeri +vibuta +vidu +vigasta +vigastu +viha=ne +vihasta +vihastu +vihise +vihka +vihma +vihma=ne +vihtu +vii +viibi +viibuta +viiendik +viiksata +viiksu +viil +viili +viim +viimane +viima=ne +viimistle +viin +viirastu +viis +Vii[s|e]+(.*) +VII[S|E]+(.*) +viisi +viisil +viitle +viitsi +viivita +viivitle +viksi +vila +vilgata +vilista +vilju +vilkle +vilksa +vilksata +vilksi +vilksu +vilku +villa=ne +villi +viltu +vilu +vindu +vingerda +vinise +vinti +vintsku +vintsle +vintsu +viratse +virise +virn +virtsu +viru +viruta +virvenda +visa +visanud +visa=nud +visatud +viserda +(.*)_visioon +visise +viska +viskama +visku +vist +viunu +võbele +võbise +võdise 
+võe +või +võibolla +võib_olla +võib-olla +võidu +võiduma +võidutse +võim +võima +võimalda +võimaldav +võimaldu +võimalik +võimalus +võimas +võime +võimenda +võimutse +võistleja +võistle=mine +võistlus +(.*)_võistlus +võistu +võit +võitle +võitlus +voldine +võlgu +volise +volita +volksa +volksata +volti +voltu +võlu +võlvu +vonkle +võnkle +võnku +võnni +vood +vooda +voola +vooldu +võõpa +võõranda +võõrandu +võõrdu +võõritse +võõrusta +võõruta +vöötle +vöötu +võpata +võpatle +võppu +vopsata +võrdle +võrdne +võrdsusta +võrdu +võrise +vorm +vormi +vormu +võrra +võrsu +võsa=ne +võt +vudi +vuha +vuhise +vuhki +vula +vulise +vulksa +vulksata +vulpsa +vulpsata +vuntsi +vupsa +vupsata +vura +vurhvi +vurise +vurista +vurtsa +vürtsita +vurtsu +vusise +vusserda +vussi + Y) ( +z^estikuleeri +z^ongleeri diff --git a/confs/conf_edt_v211_Stanza_ME_full.ini b/confs/conf_edt_v211_Stanza_ME_full.ini new file mode 100644 index 00000000..82bbdd8a --- /dev/null +++ b/confs/conf_edt_v211_Stanza_ME_full.ini @@ -0,0 +1,69 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.11 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.11/preannotated/morph_extended + +[join_morph_extended_train_full] +input_dir = edt_2.11/preannotated/morph_extended +concatenate = train, dev +output_dir = edt_2.11/preannotated/morph_extended + +[train_stanza_morph_extended_full] +experiment_type = full_data +train_file = edt_2.11/preannotated/morph_extended/et_edt-ud-train-morph_extended.conllu +eval_file = edt_2.11/preannotated/morph_extended/et_edt-ud-dev-morph_extended.conllu +args = --batch_size 5000 +output_dir = edt_2.11/trained/morph_extended/stanza_full_data +model_file = model_morph_extended.pt +predict_after = True +#dry_run = True + +[predict_stanza_morph_extended_full] +experiment_type = full_data +train_file = edt_2.11/preannotated/morph_extended/train_full.conllu +test_file = 
edt_2.11/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +model_file = edt_2.11/trained/morph_extended/stanza_full_data/model_morph_extended.pt +output_dir = edt_2.11/evaluation/morph_extended/stanza_full_data +#dry_run = True + +#[predict_stanza_tagger_morph_extended_full] +#experiment_type = full_data +#train_file = edt_2.11/preannotated/morph_extended/train_full.conllu +#test_file = edt_2.11/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +#model_file = edt_2.11/trained/morph_extended/stanza_full_data/model_morph_extended.pt +#use_estnltk = True +#seed = 43 +#morph_layer = morph_extended +#output_file_prefix = predicted_stanzatagger_ +#output_dir = edt_2.11/evaluation/morph_extended/stanza_full_data + +[eval_stanza_morph_extended_full_data] +experiment_type = full_data +gold_train = edt_2.11/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.11/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.11/evaluation/morph_extended/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.11/evaluation/morph_extended/stanza_full_data/predicted_test.conllu + +#[eval_stanza_tagger_morph_extended_full_data] +#experiment_type = full_data +#gold_train = edt_2.11/preannotated/morph_extended/train_full.conllu +#gold_test = edt_2.11/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +#predicted_train = edt_2.11/evaluation/morph_extended/stanza_full_data/predicted_stanzatagger_train.conllu +#predicted_test = edt_2.11/evaluation/morph_extended/stanza_full_data/predicted_stanzatagger_test.conllu + +#[eval_stanza_tagger_morph_extended_full_data_error_types] +#experiment_type = full_data +#count_error_types = True +#gold_train = edt_2.11/preannotated/morph_extended/train_full.conllu +#gold_test = edt_2.11/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +#predicted_train = edt_2.11/evaluation/morph_extended/stanza_full_data/predicted_train.conllu +#predicted_test 
= edt_2.11/evaluation/morph_extended/stanza_full_data/predicted_test.conllu +#name = stanza_ME_error_types +#skip_train = True +#count_words = True +#output_csv_file = edt_2.11/results_stanza_ME_error_types.csv \ No newline at end of file diff --git a/confs/conf_edt_v26_MaltParser_ME_full.ini b/confs/conf_edt_v26_MaltParser_ME_full.ini new file mode 100644 index 00000000..514c1fd7 --- /dev/null +++ b/confs/conf_edt_v26_MaltParser_ME_full.ini @@ -0,0 +1,44 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +# required by MaltOptimizer: +remove_metadata = True +output_dir = edt_2.6/preannotated/morph_extended_no_metadata + +[join_morph_extended_train_full] +input_dir = edt_2.6/preannotated/morph_extended_no_metadata +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended_no_metadata + +[maltoptimize_morph_extended_full] +input_files = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-dev-morph_extended.conllu +output_dir = edt_2.6/trained/morph_extended/maltparser_full_data + +[train_malt_morph_extended_full] +experiment_type = full_data +input_dir = edt_2.6/preannotated/morph_extended_no_metadata +train_file = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-train-morph_extended.conllu +final_options_file = edt_2.6/trained/morph_extended/maltparser_full_data/finalOptionsFile.xml +feature_model_file = edt_2.6/trained/morph_extended/maltparser_full_data/featureFile.xml +output_dir = edt_2.6/trained/morph_extended/maltparser_full_data +model_file = model_morph_extended.mco + +[predict_malt_morph_extended_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended_no_metadata/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-test-morph_extended.conllu +model_file = 
edt_2.6/trained/morph_extended/maltparser_full_data/model_morph_extended.mco +output_file_prefix = predicted_malt_ +output_dir = edt_2.6/evaluation/morph_extended/maltparser_full_data + +[eval_malt_morph_extended_full] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended_no_metadata/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/maltparser_full_data/predicted_malt_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/maltparser_full_data/predicted_malt_test.conllu \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_MA_full.ini b/confs/conf_edt_v26_Stanza_MA_full.ini new file mode 100644 index 00000000..e0cc6322 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_MA_full.ini @@ -0,0 +1,59 @@ +[preannotation_morph_analysis] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_analysis +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_analysis + +[join_morph_analysis_train_full] +input_dir = edt_2.6/preannotated/morph_analysis +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_analysis + +[train_stanza_morph_analysis_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_analysis/et_edt-ud-train-morph_analysis.conllu +eval_file = edt_2.6/preannotated/morph_analysis/et_edt-ud-dev-morph_analysis.conllu +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_analysis/stanza_full_data +model_file = model_morph_analysis.pt +predict_after = True +#dry_run = True + +[predict_stanza_morph_analysis_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_analysis/train_full.conllu +test_file = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +model_file = edt_2.6/trained/morph_analysis/stanza_full_data/model_morph_analysis.pt +output_dir 
= edt_2.6/evaluation/morph_analysis/stanza_full_data +#dry_run = True + +#[predict_stanza_tagger_morph_analysis_full] +#experiment_type = full_data +#train_file = edt_2.6/preannotated/morph_analysis/train_full.conllu +#test_file = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +#model_file = edt_2.6/trained/morph_analysis/stanza_full_data/model_morph_analysis.pt +#use_estnltk = True +#seed = 43 +#morph_layer = morph_analysis +#output_file_prefix = predicted_stanzatagger_ +#output_dir = edt_2.6/evaluation/morph_analysis/stanza_full_data + +[eval_stanza_morph_analysis_full_data] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_analysis/train_full.conllu +gold_test = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +predicted_train = edt_2.6/evaluation/morph_analysis/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_analysis/stanza_full_data/predicted_test.conllu +count_words = True + +#[eval_stanza_tagger_morph_analysis_full_data] +#experiment_type = full_data +#gold_train = edt_2.6/preannotated/morph_analysis/train_full.conllu +#gold_test = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +#predicted_train = edt_2.6/evaluation/morph_analysis/stanza_full_data/predicted_stanzatagger_train.conllu +#predicted_test = edt_2.6/evaluation/morph_analysis/stanza_full_data/predicted_stanzatagger_test.conllu +#count_words = True diff --git a/confs/conf_edt_v26_Stanza_ME_crossvalidation.ini b/confs/conf_edt_v26_Stanza_ME_crossvalidation.ini new file mode 100644 index 00000000..a864c339 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_crossvalidation.ini @@ -0,0 +1,50 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# Make train_full (for evaluation) 
+[join_morph_extended_train_full] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended + +# we need 10 crossvalidation data splits +[split_morph_extended_crossvalidation] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = crossvalidation +block_count = 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits + +# train 10 crossvalidation models +[train_stanza_morph_extended_crossvalidation] +input_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits +experiment_type = crossvalidation +predict_after = True +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended/crossvalidation + +[predict_stanza_morph_extended_crossvalidation] +experiment_type = crossvalidation +input_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +models_dir = edt_2.6/trained/morph_extended/crossvalidation +output_dir = edt_2.6/evaluation/morph_extended/stanza_crossvalidation + +# Evaluate all 10 models separately +[eval_stanza_morph_extended_crossvalidation] +experiment_type = crossvalidation +gold_splits_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predictions_dir = edt_2.6/evaluation/morph_extended/stanza_crossvalidation +macro_average = True + diff --git a/confs/conf_edt_v26_Stanza_ME_ensemble_conf_intervals.ini b/confs/conf_edt_v26_Stanza_ME_ensemble_conf_intervals.ini new file mode 100644 index 00000000..53dd3173 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_ensemble_conf_intervals.ini @@ -0,0 +1,50 @@ +# Required preprocessing from configurations: +# * conf_edt_v26_Stanza_ME_full.ini +# *
conf_edt_v26_stanza_ME_ensemble_full.ini +# * conf_edt_v26_stanza_ME_ensemble_majority_voting_full.ini +# * conf_edt_v26_Stanza_ME_half_data.ini +# * conf_edt_v26_Stanza_ME_half_data_ensemble_majority_voting.ini + +# Single model (default) +[eval_stanza_morph_extended_full_data_conf_intervals] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_test.conllu +add_conf_intervals = True + +# Full data ensemble +[eval_stanza_ensemble_morph_extended_full_data_las_coherence_conf_intervals] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +add_conf_intervals = True +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_test.conllu +#output_csv_file = edt_2.6/results_stanza_ME_conf_intervals.csv + +[eval_stanza_ensemble_morph_extended_full_data_majority_voting_conf_intervals] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +add_conf_intervals = True +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_test.conllu + +# Half data ensemble +[eval_stanza_ensemble_morph_extended_half_data_las_coherence_conf_intervals] 
+experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +add_conf_intervals = True +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_test.conllu + +[eval_stanza_ensemble_morph_extended_half_data_majority_voting_conf_intervals] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +add_conf_intervals = True +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_majority_voting_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_majority_voting_test.conllu \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_ensemble_entropy.ini b/confs/conf_edt_v26_Stanza_ME_ensemble_entropy.ini new file mode 100644 index 00000000..90f35c02 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_ensemble_entropy.ini @@ -0,0 +1,43 @@ +# Required preprocessing from configurations: +# * conf_edt_v26_stanza_ME_ensemble_full.ini +# * conf_edt_v26_stanza_ME_ensemble_majority_voting_full.ini +# * conf_edt_v26_Stanza_ME_half_data.ini +# * conf_edt_v26_Stanza_ME_half_data_ensemble_majority_voting.ini + +# Full data ensemble +[eval_stanza_ensemble_morph_extended_full_data_las_coherence_entropy] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +calc_scores_with_entropy = True +count_words = True +predicted_train = 
edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_test.conllu + +[eval_stanza_ensemble_morph_extended_full_data_majority_voting_entropy] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +calc_scores_with_entropy = True +count_words = True +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_test.conllu + +# Half data ensemble +[eval_stanza_ensemble_morph_extended_half_data_las_coherence_entropy] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +calc_scores_with_entropy = True +count_words = True +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_test.conllu + +[eval_stanza_ensemble_morph_extended_half_data_majority_voting_entropy] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +calc_scores_with_entropy = True +count_words = True +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_majority_voting_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_majority_voting_test.conllu \ No 
newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_full.ini b/confs/conf_edt_v26_Stanza_ME_full.ini new file mode 100644 index 00000000..4b0dea4b --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_full.ini @@ -0,0 +1,81 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +[join_morph_extended_train_full] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended + +[train_stanza_morph_extended_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended/et_edt-ud-train-morph_extended.conllu +eval_file = edt_2.6/preannotated/morph_extended/et_edt-ud-dev-morph_extended.conllu +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended/stanza_full_data +model_file = model_morph_extended.pt +predict_after = True +#dry_run = True + +[predict_stanza_morph_extended_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +model_file = edt_2.6/trained/morph_extended/stanza_full_data/model_morph_extended.pt +output_dir = edt_2.6/evaluation/morph_extended/stanza_full_data +#dry_run = True + +[predict_stanza_tagger_morph_extended_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +model_file = edt_2.6/trained/morph_extended/stanza_full_data/model_morph_extended.pt +use_estnltk = True +seed = 43 +morph_layer = morph_extended +output_file_prefix = predicted_stanzatagger_ +output_dir = edt_2.6/evaluation/morph_extended/stanza_full_data + +[eval_stanza_morph_extended_full_data] +experiment_type = full_data 
+gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_test.conllu +count_words = True + +[eval_stanza_morph_extended_full_data_conf_intervals] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_test.conllu +count_words = True +add_conf_intervals = True +output_csv_file = edt_2.6/results_stanza_ME_full_conf_intervals.csv + +#[eval_stanza_tagger_morph_extended_full_data] +#experiment_type = full_data +#gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +#gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +#predicted_train = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_stanzatagger_train.conllu +#predicted_test = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_stanzatagger_test.conllu +#count_words = True + +[eval_stanza_tagger_morph_extended_full_data_error_types] +experiment_type = full_data +count_error_types = True +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_stanzatagger_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_full_data/predicted_stanzatagger_test.conllu +name = stanza_ME_error_types +skip_train = True +count_words = True +output_csv_file = edt_2.6/results_stanza_ME_error_types.csv diff --git 
a/confs/conf_edt_v26_Stanza_ME_full_clauses.ini b/confs/conf_edt_v26_Stanza_ME_full_clauses.ini new file mode 100644 index 00000000..dc06d1e1 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_full_clauses.ini @@ -0,0 +1,58 @@ +# * Prepares clauses ME dataset +# * Trains model on clauses ME dataset and also applies on clauses ME dataset +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +[extract_clauses_morph_extended] +input_dir = edt_2.6/preannotated/morph_extended +skip_list = train_full.conllu +output_dir = edt_2.6/preannotated/morph_extended_clauses + +[join_morph_extended_clauses] +input_dir = edt_2.6/preannotated/morph_extended_clauses +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended_clauses + +[train_stanza_morph_extended_on_clauses] +input_dir = edt_2.6/preannotated/morph_extended_clauses +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-train-morph_extended.conllu +eval_file = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-dev-morph_extended.conllu +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended_clauses/stanza_full_data +model_file = model_morph_extended_clauses.pt +predict_after = True + +[predict_stanza_morph_extended_on_clauses] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended_clauses/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-test-morph_extended.conllu +model_file = edt_2.6/trained/morph_extended_clauses/stanza_full_data/model_morph_extended_clauses.pt +output_file_prefix = predicted_ +output_dir = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data + +[eval_stanza_morph_extended_on_clauses] +experiment_type = full_data +gold_train = 
edt_2.6/preannotated/morph_extended_clauses/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data/predicted_test.conllu + +[eval_stanza_morph_extended_on_clauses_error_types] +experiment_type = full_data +count_error_types = True +gold_train = edt_2.6/preannotated/morph_extended_clauses/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data/predicted_test.conllu +name = stanza_ME_on_clauses_error_types +skip_train = True +count_words = True +output_csv_file = edt_2.6/results_stanza_ME_on_clauses_error_types.csv \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_full_predict_clauses.ini b/confs/conf_edt_v26_Stanza_ME_full_predict_clauses.ini new file mode 100644 index 00000000..d29eae45 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_full_predict_clauses.ini @@ -0,0 +1,67 @@ +# * Prepares clauses ME dataset and full ME dataset +# * Trains model on full ME dataset and applies on clauses ME dataset +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# full ME dataset +[join_morph_extended_train_full] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended + +# clauses ME dataset +[extract_clauses_morph_extended] +input_dir = edt_2.6/preannotated/morph_extended +skip_list = train_full.conllu +output_dir = 
edt_2.6/preannotated/morph_extended_clauses + +[join_morph_extended_clauses] +input_dir = edt_2.6/preannotated/morph_extended_clauses +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended_clauses + +# train on full ME dataset +[train_stanza_morph_extended_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended/et_edt-ud-train-morph_extended.conllu +eval_file = edt_2.6/preannotated/morph_extended/et_edt-ud-dev-morph_extended.conllu +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended/stanza_full_data +model_file = model_morph_extended.pt +predict_after = True +#dry_run = True + +# predict with full ME model on clauses ME dataset +[predict_stanza_morph_extended_full_on_clauses] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended_clauses/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-test-morph_extended.conllu +model_file = edt_2.6/trained/morph_extended/stanza_full_data/model_morph_extended.pt +output_file_prefix = predicted_ +output_dir = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data_not_trained_on_clauses + +[eval_stanza_morph_extended_full_predict_on_clauses] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended_clauses/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data_not_trained_on_clauses/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data_not_trained_on_clauses/predicted_test.conllu + +[eval_stanza_morph_extended_full_predict_on_clauses_error_types] +experiment_type = full_data +count_error_types = True +gold_train = edt_2.6/preannotated/morph_extended_clauses/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended_clauses/et_edt-ud-test-morph_extended.conllu +predicted_train = 
edt_2.6/evaluation/morph_extended_clauses/stanza_full_data_not_trained_on_clauses/predicted_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended_clauses/stanza_full_data_not_trained_on_clauses/predicted_test.conllu +name = stanza_ME_full_on_clauses_error_types +skip_train = True +count_words = True +output_csv_file = edt_2.6/results_stanza_ME_full_predict_on_clauses_error_types.csv \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_gap_experiments.ini b/confs/conf_edt_v26_Stanza_ME_gap_experiments.ini new file mode 100644 index 00000000..35e12151 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_gap_experiments.ini @@ -0,0 +1,56 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +[modify_conllu_morph_extended_full] +input_dir = edt_2.6/preannotated/morph_extended +gap_experiments = 01_no_wordforms, + 02_no_lemmas, + 02_no_pos, + 03_no_wordforms_adj_noun_lemmas, + 04_no_wordforms_verb_adpos_lemmas, + 05_only_cg_list_wordforms_lemmas, + 06_no_wordform_lemma_pos_keep_conj, + 07_no_wordform_lemma_pos, + 08_only_wordforms, + 09_only_pos_feats +output_dir = edt_2.6/preannotated/morph_extended/gap_experiments + +[train_stanza_morph_extended_gap_experiments] +input_dir = edt_2.6/preannotated/morph_extended/gap_experiments +experiment_type = multi_experiment +train_file_pat = (?P\S+)_train.conllu +dev_file_pat = (?P\S+)_dev.conllu +test_file_pat = (?P\S+)_dev.conllu +args = --batch_size 5000 +predict_after = True +output_dir = edt_2.6/trained/morph_extended/gap_experiments +#dry_run = True + +[predict_stanza_morph_extended_gap_experiments] +input_dir = edt_2.6/preannotated/morph_extended/gap_experiments +experiment_type = multi_experiment +train_file_pat = (?P\S+)_train.conllu +test_file = (?P\S+)_test.conllu +test_file_is_pattern = True +models_dir
= edt_2.6/trained/morph_extended/gap_experiments +output_dir = edt_2.6/evaluation/morph_extended/gap_experiments +use_gpu = True +#dry_run = True + +[eval_stanza_morph_extended_gap_experiments] +experiment_type = multi_experiment +gold_splits_dir = edt_2.6/preannotated/morph_extended/gap_experiments +gold_test = (?P\S+)_test.conllu +test_file_is_pattern = True +train_file_pat = (?P\S+)_train.conllu +# we need to specify punct listing, because xpos/upos have been deleted ... +punct_tokens_file = background_data/edt_punct_tokens.txt +predictions_dir = edt_2.6/evaluation/morph_extended/gap_experiments +macro_average = False diff --git a/confs/conf_edt_v26_Stanza_ME_half_data.ini b/confs/conf_edt_v26_Stanza_ME_half_data.ini new file mode 100644 index 00000000..3301fc69 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_half_data.ini @@ -0,0 +1,69 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# make 50%-50% data splits +[split_morph_extended_half_data] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = half_data +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/half_data_splits/initial_data +final_output_dir = edt_2.6/preannotated/morph_extended/half_data_splits + +# train half data models +[train_stanza_morph_extended_half_data] +input_dir = edt_2.6/preannotated/morph_extended/half_data_splits +experiment_type = half_data +train_file_pat = (?P\S+)_train.conllu +dev_file_pat = (?P\S+)_dev.conllu +predict_after = True +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended/stanza_half_data + +[predict_stanza_morph_extended_half_data] +experiment_type = half_data +input_dir = edt_2.6/preannotated/morph_extended/half_data_splits +train_file_pat = (?P\S+)_train_all.conllu +test_file = 
edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +models_dir = edt_2.6/trained/morph_extended/stanza_half_data +output_dir = edt_2.6/evaluation/morph_extended/stanza_half_data + +# predict using 10 models as an ensemble +[predict_stanza_morph_extended_half_data_ensemble] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +models_dir = edt_2.6/trained/morph_extended/stanza_half_data +#tagger_path = ensemble_tagger.StanzaSyntaxEnsembleTagger +tagger_path = ensemble_tagger_with_entropy.StanzaSyntaxEnsembleTaggerWithEntropy +use_estnltk = True +use_ensemble = True +seed = 43 +scores_seed = 3 +use_gpu = True +morph_layer = morph_extended +output_file_prefix = predicted_stanzaensembletagger_ +output_dir = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data + +# Evaluate all 10 models separately +[eval_stanza_morph_extended_half_data] +experiment_type = crossvalidation +gold_splits_dir = edt_2.6/preannotated/morph_extended/half_data_splits +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predictions_dir = edt_2.6/evaluation/morph_extended/stanza_half_data +macro_average = True + +# Evaluate the ensemble model +[eval_stanza_morph_extended_half_data_ensemble] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_test.conllu \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_half_data_ensemble_majority_voting.ini b/confs/conf_edt_v26_Stanza_ME_half_data_ensemble_majority_voting.ini new file mode 100644 
index 00000000..6b627072 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_half_data_ensemble_majority_voting.ini @@ -0,0 +1,54 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# make 50%-50% data splits +[split_morph_extended_half_data] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = half_data +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/half_data_splits/initial_data +final_output_dir = edt_2.6/preannotated/morph_extended/half_data_splits + +# train half data models +[train_stanza_morph_extended_half_data] +input_dir = edt_2.6/preannotated/morph_extended/half_data_splits +experiment_type = half_data +train_file_pat = (?P\S+)_train.conllu +dev_file_pat = (?P\S+)_dev.conllu +predict_after = True +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended/stanza_half_data + +# predict using 10 models as an ensemble +[predict_stanza_morph_extended_half_data_ensemble_majority_voting] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +models_dir = edt_2.6/trained/morph_extended/stanza_half_data +#tagger_path = ensemble_tagger.StanzaSyntaxEnsembleTagger +tagger_path = ensemble_tagger_with_entropy.StanzaSyntaxEnsembleTaggerWithEntropy +use_estnltk = True +use_ensemble = True +use_majority_voting = True +seed = 43 +scores_seed = 3 +use_gpu = True +morph_layer = morph_extended +output_file_prefix = predicted_stanzaensembletagger_majority_voting_ +output_dir = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data + +# Evaluate the ensemble model +[eval_stanza_morph_extended_half_data_ensemble_majority_voting] +experiment_type = full_data +gold_train = 
edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_majority_voting_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_half_data/predicted_stanzaensembletagger_majority_voting_test.conllu \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups.ini b/confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups.ini new file mode 100644 index 00000000..21a0aaec --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups.ini @@ -0,0 +1,94 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +[extract_clauses_morph_extended] +input_dir = edt_2.6/preannotated/morph_extended +skip_list = train_full.conllu +output_dir = edt_2.6/preannotated/morph_extended_clauses + +[make_sketches_table_morph_extended_top_50] +input_dir = edt_2.6/preannotated/morph_extended_clauses +top_n = 50 +output_csv_file = edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv + +[prepare_knockout_morph_extended_top_50_sketches_5groups] +input_dir = edt_2.6/preannotated/morph_extended_clauses +top_sketches_file = edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv +min_support = 50 +# distribute top_n_sketches randomly into 5 roughly same size groups +# prepare knockout on groups +random_groups = 5 +grouping_seed = 1 +initial_output_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial +final_output_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_splits + +[train_stanza_morph_extended_sketches_knockout_5groups] +input_dir = 
edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_splits +experiment_type = multi_experiment +train_file_pat = train_(?Pgroup\d+).conllu +dev_file_pat = dev_(?Pgroup\d+).conllu +test_file_pat = dev_(?Pgroup\d+).conllu +args = --batch_size 5000 +predict_after = False +output_dir = edt_2.6/trained/morph_extended_clauses/knockout_5groups +#dry_run = True + +[predict_stanza_morph_extended_sketches_knockout_5groups_matrix] +input_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_splits +experiment_type = multi_experiment +train_file_pat = train_(?Pgroup\d+).conllu +test_file = test_(?Pgroup\d+).conllu +test_file_is_pattern = True +# test matrix: evaluate all models on all test files +test_matrix = True +output_file_prefix = predicted_matrix_ +models_dir = edt_2.6/trained/morph_extended_clauses/knockout_5groups +output_dir = edt_2.6/evaluation/morph_extended_clauses/knockout_5groups +use_gpu = True +#dry_run = True + +[predict_stanza_morph_extended_sketches_knockout_5groups_single_test] +input_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_splits +experiment_type = multi_experiment +train_file_pat = train_(?Pgroup\d+).conllu +test_file = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial/test_50x50.conllu +test_file_is_pattern = False +models_dir = edt_2.6/trained/morph_extended_clauses/knockout_5groups +output_dir = edt_2.6/evaluation/morph_extended_clauses/knockout_5groups +use_gpu = True +#dry_run = True + +[eval_stanza_morph_extended_sketches_knockout_5groups_matrix] +experiment_type = multi_experiment +gold_splits_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_splits +gold_test = test_(?Pgroup\d+).conllu +test_file_is_pattern = True +train_file_pat = train_(?Pgroup\d+).conllu +test_matrix = True +skip_train = True +predictions_prefix = predicted_matrix_ +punct_tokens_file = background_data/edt_punct_tokens.txt +predictions_dir = 
edt_2.6/evaluation/morph_extended_clauses/knockout_5groups +macro_average = False +output_csv_file = edt_2.6/results_stanza_ME_sketches_5groups_knockout_matrix.csv + +[eval_stanza_morph_extended_sketches_knockout_5groups_single_file] +experiment_type = multi_experiment +gold_splits_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_splits +gold_test = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial/test_50x50.conllu +test_file_is_pattern = False +train_file_pat = train_(?Pgroup\d+).conllu +test_matrix = False +skip_train = True +predictions_prefix = predicted_ +punct_tokens_file = background_data/edt_punct_tokens.txt +predictions_dir = edt_2.6/evaluation/morph_extended_clauses/knockout_5groups +macro_average = False +output_csv_file = edt_2.6/results_stanza_ME_sketches_5groups_knockout.csv \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups_random.ini b/confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups_random.ini new file mode 100644 index 00000000..43d611f1 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups_random.ini @@ -0,0 +1,97 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +[extract_clauses_morph_extended] +input_dir = edt_2.6/preannotated/morph_extended +skip_list = train_full.conllu +output_dir = edt_2.6/preannotated/morph_extended_clauses + +[make_sketches_table_morph_extended_top_50] +input_dir = edt_2.6/preannotated/morph_extended_clauses +top_n = 50 +output_csv_file = edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv + +[prepare_knockout_morph_extended_top_50_sketches_5randomgroups] +input_dir = edt_2.6/preannotated/morph_extended_clauses +top_sketches_file = edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv 
+min_support = 50 +# distribute top_n_sketches randomly into 5 roughly same size groups +random_groups = 5 +grouping_seed = 1 +# Create control dataset: randomly choose the same number of clauses as in the knockout sketches +# prepare knockout +create_control = True +create_control_seed = 5 +initial_output_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial +final_output_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_random + +[train_stanza_morph_extended_sketches_knockout_5randomgroups] +input_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_random +experiment_type = multi_experiment +train_file_pat = train_(?Pgroup\d+).conllu +dev_file_pat = dev_(?Pgroup\d+).conllu +test_file_pat = dev_(?Pgroup\d+).conllu +args = --batch_size 5000 +predict_after = False +output_dir = edt_2.6/trained/morph_extended_clauses/knockout_5groups_random +#dry_run = True + +[predict_stanza_morph_extended_sketches_knockout_5randomgroups_matrix] +input_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_random +experiment_type = multi_experiment +train_file_pat = train_(?Pgroup\d+).conllu +test_file = test_(?Pgroup\d+).conllu +test_file_is_pattern = True +# test matrix: evaluate all models on all test files +test_matrix = True +output_file_prefix = predicted_matrix_ +models_dir = edt_2.6/trained/morph_extended_clauses/knockout_5groups_random +output_dir = edt_2.6/evaluation/morph_extended_clauses/knockout_5groups_random +use_gpu = True +#dry_run = True + +[predict_stanza_morph_extended_sketches_knockout_5randomgroups_single_test] +input_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_random +experiment_type = multi_experiment +train_file_pat = train_(?Pgroup\d+).conllu +test_file = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial/test_50x50.conllu +test_file_is_pattern = False +models_dir = edt_2.6/trained/morph_extended_clauses/knockout_5groups_random +output_dir =
edt_2.6/evaluation/morph_extended_clauses/knockout_5groups_random +use_gpu = True +#dry_run = True + +[eval_stanza_morph_extended_sketches_knockout_5randomgroups_matrix] +experiment_type = multi_experiment +gold_splits_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_random +gold_test = test_(?Pgroup\d+).conllu +test_file_is_pattern = True +train_file_pat = train_(?Pgroup\d+).conllu +test_matrix = True +skip_train = True +predictions_prefix = predicted_matrix_ +punct_tokens_file = background_data/edt_punct_tokens.txt +predictions_dir = edt_2.6/evaluation/morph_extended_clauses/knockout_5groups_random +macro_average = False +output_csv_file = edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout_matrix.csv + +[eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file] +experiment_type = multi_experiment +gold_splits_dir = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_random +gold_test = edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial/test_50x50.conllu +test_file_is_pattern = False +train_file_pat = train_(?Pgroup\d+).conllu +test_matrix = False +skip_train = True +predictions_prefix = predicted_ +punct_tokens_file = background_data/edt_punct_tokens.txt +predictions_dir = edt_2.6/evaluation/morph_extended_clauses/knockout_5groups_random +macro_average = False +output_csv_file = edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout.csv \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_smaller_data_01_only_form.ini b/confs/conf_edt_v26_Stanza_ME_smaller_data_01_only_form.ini new file mode 100644 index 00000000..c96d44ae --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_smaller_data_01_only_form.ini @@ -0,0 +1,69 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# we need 10 
smaller data splits +[split_morph_extended_smaller_data_01] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = smaller_data +block_count = 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all + +# modify train/train_all/dev files +[modify_conllu_morph_extended_smaller_data_01_train_dev] +input_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all +gap_experiments = 08_only_wordforms +suppress_checks = True +conll_file_pat = (?P\S+)_(train|train_all|dev).conllu +output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/01_only_form + +# modify test file +[modify_conllu_morph_extended_smaller_data_01_test] +input_dir = edt_2.6/preannotated/morph_extended +gap_experiments = 08_only_wordforms +suppress_checks = True +conll_file_pat = \S+-(?Ptest)-\S+\.conllu +output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits +# --> Creates: 08_only_wordforms_test_test.conllu + +[train_stanza_morph_extended_smaller_data_01] +input_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/01_only_form +experiment_type = smaller_data +train_file_pat = (?P\S+)_train.conllu +dev_file_pat = (?P\S+)_dev.conllu +test_file_pat = (?P\S+)_dev.conllu +args = --batch_size 5000 +predict_after = True +output_dir = edt_2.6/trained/morph_extended/smaller_data/01_only_form +#dry_run = True + +[predict_stanza_morph_extended_smaller_data_01] +input_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/01_only_form +experiment_type = smaller_data +train_file_pat = (?P\S+)_train_all.conllu +test_file = edt_2.6/preannotated/morph_extended/smaller_data_splits/08_only_wordforms_test_test.conllu +models_dir = edt_2.6/trained/morph_extended/smaller_data/01_only_form +output_dir = edt_2.6/evaluation/morph_extended/smaller_data/01_only_form +use_gpu = True 
+#dry_run = True + +[eval_stanza_morph_extended_smaller_data_01] +experiment_type = smaller_data +gold_splits_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/01_only_form +gold_test = edt_2.6/preannotated/morph_extended/smaller_data_splits/08_only_wordforms_test_test.conllu +train_file_pat = (?P\S+)_train_all.conllu +# we need to specify punct listing, because xpos/upos have been deleted ... +punct_tokens_file = background_data/edt_punct_tokens.txt +predictions_dir = edt_2.6/evaluation/morph_extended/smaller_data/01_only_form +macro_average = False +count_words = True \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_ME_smaller_data_02_keep_all.ini b/confs/conf_edt_v26_Stanza_ME_smaller_data_02_keep_all.ini new file mode 100644 index 00000000..0d2dc432 --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_smaller_data_02_keep_all.ini @@ -0,0 +1,50 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# we need 10 smaller data splits +[split_morph_extended_smaller_data_02] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = smaller_data +block_count = 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all + +[train_stanza_morph_extended_smaller_data_02_keep_all] +input_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all +experiment_type = smaller_data +train_file_pat = (?P\S+)_train.conllu +dev_file_pat = (?P\S+)_dev.conllu +test_file_pat = (?P\S+)_dev.conllu +args = --batch_size 5000 +predict_after = True +output_dir = edt_2.6/trained/morph_extended/smaller_data/02_keep_all +#dry_run = True + 
+[predict_stanza_morph_extended_smaller_data_02_keep_all] +input_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all +experiment_type = smaller_data +train_file_pat = (?P\S+)_train_all.conllu +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +models_dir = edt_2.6/trained/morph_extended/smaller_data/02_keep_all +output_dir = edt_2.6/evaluation/morph_extended/smaller_data/02_keep_all +use_gpu = True +#dry_run = True + +[eval_stanza_morph_extended_smaller_data_02_keep_all] +experiment_type = smaller_data +gold_splits_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +train_file_pat = (?P\S+)_train_all.conllu +predictions_dir = edt_2.6/evaluation/morph_extended/smaller_data/02_keep_all +macro_average = False +count_words = True diff --git a/confs/conf_edt_v26_Stanza_ME_smaller_data_03_only_pos_feats.ini b/confs/conf_edt_v26_Stanza_ME_smaller_data_03_only_pos_feats.ini new file mode 100644 index 00000000..f7e88dff --- /dev/null +++ b/confs/conf_edt_v26_Stanza_ME_smaller_data_03_only_pos_feats.ini @@ -0,0 +1,67 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# we need 10 smaller data splits +[split_morph_extended_smaller_data_03] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = smaller_data +block_count = 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all + +# modify train/train_all/dev files +[modify_conllu_morph_extended_smaller_data_03_train_dev] +input_dir = 
edt_2.6/preannotated/morph_extended/smaller_data_splits/02_keep_all +gap_experiments = 09_only_pos_feats +suppress_checks = True +conll_file_pat = (?P\S+)_(train|train_all|dev).conllu +output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/03_only_pos_feats + +# modify test file +[modify_conllu_morph_extended_smaller_data_03_test] +input_dir = edt_2.6/preannotated/morph_extended +gap_experiments = 09_only_pos_feats +suppress_checks = True +conll_file_pat = \S+-(?Ptest)-\S+\.conllu +output_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits +# --> Creates: 09_only_pos_feats_test_test.conllu + +[train_stanza_morph_extended_smaller_data_03] +input_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/03_only_pos_feats +experiment_type = smaller_data +train_file_pat = (?P\S+)_train.conllu +dev_file_pat = (?P\S+)_dev.conllu +test_file_pat = (?P\S+)_dev.conllu +args = --batch_size 5000 +predict_after = True +output_dir = edt_2.6/trained/morph_extended/smaller_data/03_only_pos_feats +#dry_run = True + +[predict_stanza_morph_extended_smaller_data_03_only_pos_feats] +input_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/03_only_pos_feats +experiment_type = smaller_data +train_file_pat = (?P\S+)_train_all.conllu +test_file = edt_2.6/preannotated/morph_extended/smaller_data_splits/09_only_pos_feats_test_test.conllu +models_dir = edt_2.6/trained/morph_extended/smaller_data/03_only_pos_feats +output_dir = edt_2.6/evaluation/morph_extended/smaller_data/03_only_pos_feats +use_gpu = True +#dry_run = True + +[eval_stanza_morph_extended_smaller_data_03_only_pos_feats] +experiment_type = smaller_data +gold_splits_dir = edt_2.6/preannotated/morph_extended/smaller_data_splits/03_only_pos_feats +gold_test = edt_2.6/preannotated/morph_extended/smaller_data_splits/09_only_pos_feats_test_test.conllu +train_file_pat = (?P\S+)_train_all.conllu +predictions_dir = edt_2.6/evaluation/morph_extended/smaller_data/03_only_pos_feats +macro_average = 
False +count_words = True \ No newline at end of file diff --git a/confs/conf_edt_v26_Stanza_UD_morph_Stanza_UD_syntax.ini b/confs/conf_edt_v26_Stanza_UD_morph_Stanza_UD_syntax.ini new file mode 100644 index 00000000..bd3fa4fa --- /dev/null +++ b/confs/conf_edt_v26_Stanza_UD_morph_Stanza_UD_syntax.ini @@ -0,0 +1,59 @@ +# predict with stanza's et morph tagger & lemmatizer and then train stanza syntax on it +[copy_ud_morph] +input_dir = UD_Estonian-EDT-r2.6 +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/ud_stanza_auto_morph + +[join_ud_stanza_morph_train_full] +input_dir = edt_2.6/preannotated/ud_stanza_auto_morph +concatenate = train, dev +output_dir = edt_2.6/preannotated/ud_stanza_auto_morph + +[predict_morph_feats_stanza_ud_1] +train_file = edt_2.6/preannotated/ud_stanza_auto_morph/et_edt-ud-train.conllu +test_file = edt_2.6/preannotated/ud_stanza_auto_morph/et_edt-ud-test.conllu +# we're using models from stanza's resources ver 1.4.1 (since we use stanza 1.4.2) +download_models = True +output_file_prefix = morph_predicted_ +output_dir = edt_2.6/preannotated/ud_stanza_auto_morph +use_gpu = True + +[predict_morph_feats_stanza_ud_2] +train_file = edt_2.6/preannotated/ud_stanza_auto_morph/train_full.conllu +test_file = edt_2.6/preannotated/ud_stanza_auto_morph/et_edt-ud-dev.conllu +# we're using models from stanza's resources ver 1.4.1 (since we use stanza 1.4.2) +download_models = True +output_file_prefix = morph_predicted_ +output_dir = edt_2.6/preannotated/ud_stanza_auto_morph +use_gpu = True + +[train_stanza_ud_on_stanza_auto_morph] +experiment_type = full_data +train_file = edt_2.6/preannotated/ud_stanza_auto_morph/morph_predicted_et_edt-ud-train.conllu +eval_file = edt_2.6/preannotated/ud_stanza_auto_morph/morph_predicted_et_edt-ud-dev.conllu +args = --batch_size 5000 +output_dir = edt_2.6/trained/ud_stanza_auto_morph/stanza_full_data +model_file = deprel_model_ud.pt +predict_after = True +#dry_run = True 
+ +[predict_stanza_ud_on_stanza_auto_morph_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/ud_stanza_auto_morph/morph_predicted_train_full.conllu +test_file = edt_2.6/preannotated/ud_stanza_auto_morph/morph_predicted_et_edt-ud-test.conllu +model_file = edt_2.6/trained/ud_stanza_auto_morph/stanza_full_data/deprel_model_ud.pt +output_dir = edt_2.6/evaluation/ud_stanza_auto_morph/stanza_full_data +#dry_run = True + +[eval_stanza_ud_on_stanza_auto_morph_full_data] +experiment_type = full_data +gold_train = edt_2.6/preannotated/ud_stanza_auto_morph/train_full.conllu +gold_test = edt_2.6/preannotated/ud_stanza_auto_morph/et_edt-ud-test.conllu +predicted_train = edt_2.6/evaluation/ud_stanza_auto_morph/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/ud_stanza_auto_morph/stanza_full_data/predicted_test.conllu + + + + diff --git a/confs/conf_edt_v26_UDPipe1_ME_full.ini b/confs/conf_edt_v26_UDPipe1_ME_full.ini new file mode 100644 index 00000000..9ae6d931 --- /dev/null +++ b/confs/conf_edt_v26_UDPipe1_ME_full.ini @@ -0,0 +1,65 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +# required by MaltOptimizer: +remove_metadata = True +output_dir = edt_2.6/preannotated/morph_extended_no_metadata + +[join_morph_extended_train_full] +input_dir = edt_2.6/preannotated/morph_extended_no_metadata +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended_no_metadata + +[train_udpipe1_default_morph_extended_full] +# Train UDPipe with default parameters, except learning_rate & number of iterations +experiment_type = full_data +input_dir = edt_2.6/preannotated/morph_extended_no_metadata +train_file = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-train-morph_extended.conllu +parser_options = learning_rate=0.01;iterations=30 +output_dir = 
edt_2.6/trained/morph_extended/udpipe1_full_data +model_file = model_morph_extended_default.udpipe + +[train_udpipe1_embeddings_morph_extended_full] +# Train UDPipe with embeddings +experiment_type = full_data +input_dir = edt_2.6/preannotated/morph_extended_no_metadata +train_file = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-train-morph_extended.conllu +parser_options = learning_rate=0.01;iterations=30 +create_embeddings_file = et_edt.skip.forms.50.vectors +output_dir = edt_2.6/trained/morph_extended/udpipe1_full_data +model_file = model_morph_extended_embed.udpipe + +[predict_udpipe1_default_morph_extended_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended_no_metadata/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-test-morph_extended.conllu +model_file = edt_2.6/trained/morph_extended/udpipe1_full_data/model_morph_extended_default.udpipe +output_file_prefix = predicted_udpipe1_default_ +output_dir = edt_2.6/evaluation/morph_extended/udpipe1_full_data + +[predict_udpipe1_embeddings_morph_extended_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended_no_metadata/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-test-morph_extended.conllu +model_file = edt_2.6/trained/morph_extended/udpipe1_full_data/model_morph_extended_embed.udpipe +output_file_prefix = predicted_udpipe1_embed_ +output_dir = edt_2.6/evaluation/morph_extended/udpipe1_full_data + +[eval_udpipe1_default_morph_extended] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended_no_metadata/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/udpipe1_full_data/predicted_udpipe1_default_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/udpipe1_full_data/predicted_udpipe1_default_test.conllu + 
+[eval_udpipe1_embeddings_morph_extended] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended_no_metadata/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended_no_metadata/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/udpipe1_full_data/predicted_udpipe1_embed_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/udpipe1_full_data/predicted_udpipe1_embed_test.conllu diff --git a/confs/conf_edt_v26_stanza_MA_ensemble_full.ini b/confs/conf_edt_v26_stanza_MA_ensemble_full.ini new file mode 100644 index 00000000..d8f43aae --- /dev/null +++ b/confs/conf_edt_v26_stanza_MA_ensemble_full.ini @@ -0,0 +1,58 @@ +[preannotation_morph_analysis] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_analysis +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_analysis + +# Make train_full (for evaluation) +[join_morph_analysis_train_full] +input_dir = edt_2.6/preannotated/morph_analysis +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_analysis + +# we need crossvalidation data splits for ensemble models +[split_morph_analysis_crossvalidation] +input_dir = edt_2.6/preannotated/morph_analysis +concatenate = train, dev +split_type = crossvalidation +block_count = 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_analysis/crossvalidation_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_analysis/crossvalidation_splits + +# train 10 crossvalidation models +[train_stanza_morph_analysis_crossvalidation] +input_dir = edt_2.6/preannotated/morph_analysis/crossvalidation_splits +experiment_type = crossvalidation +predict_after = True +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_analysis/crossvalidation + +# predict using 10 models as an ensemble +[predict_stanza_ensemble_morph_analysis_default] +experiment_type = full_data 
+train_file = edt_2.6/preannotated/morph_analysis/train_full.conllu +test_file = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +models_dir = edt_2.6/trained/morph_analysis/crossvalidation +tagger_path = ensemble_tagger_with_entropy.StanzaSyntaxEnsembleTaggerWithEntropy +use_estnltk = True +use_ensemble = True +seed = 43 +scores_seed = 3 +use_gpu = True +morph_layer = morph_analysis +output_file_prefix = predicted_stanzaensembletagger_default_ +output_dir = edt_2.6/evaluation/morph_analysis/stanza_ensemble_full_data + +[eval_stanza_ensemble_morph_analysis_full_data_default] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_analysis/train_full.conllu +gold_test = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +predicted_train = edt_2.6/evaluation/morph_analysis/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_train.conllu +predicted_test = edt_2.6/evaluation/morph_analysis/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_test.conllu + diff --git a/confs/conf_edt_v26_stanza_MA_ensemble_majority_voting_full.ini b/confs/conf_edt_v26_stanza_MA_ensemble_majority_voting_full.ini new file mode 100644 index 00000000..7648932f --- /dev/null +++ b/confs/conf_edt_v26_stanza_MA_ensemble_majority_voting_full.ini @@ -0,0 +1,59 @@ +[preannotation_morph_analysis] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_analysis +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_analysis + +# Make train_full (for evaluation) +[join_morph_analysis_train_full] +input_dir = edt_2.6/preannotated/morph_analysis +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_analysis + +# we need crossvalidation data splits for ensemble models +[split_morph_analysis_crossvalidation] +input_dir = edt_2.6/preannotated/morph_analysis +concatenate = train, dev +split_type = crossvalidation +block_count 
= 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_analysis/crossvalidation_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_analysis/crossvalidation_splits + +# train 10 crossvalidation models +[train_stanza_morph_analysis_crossvalidation] +input_dir = edt_2.6/preannotated/morph_analysis/crossvalidation_splits +experiment_type = crossvalidation +predict_after = True +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_analysis/crossvalidation + +# predict using 10 models as an ensemble +[predict_stanza_ensemble_morph_analysis_majority_voting] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_analysis/train_full.conllu +test_file = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +models_dir = edt_2.6/trained/morph_analysis/crossvalidation +tagger_path = ensemble_tagger_with_entropy.StanzaSyntaxEnsembleTaggerWithEntropy +use_estnltk = True +use_ensemble = True +use_majority_voting = True +seed = 43 +scores_seed = 3 +use_gpu = True +morph_layer = morph_analysis +output_file_prefix = predicted_stanzaensembletagger_majority_voting_ +output_dir = edt_2.6/evaluation/morph_analysis/stanza_ensemble_full_data + +[eval_stanza_ensemble_morph_analysis_full_data_majority_voting] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_analysis/train_full.conllu +gold_test = edt_2.6/preannotated/morph_analysis/et_edt-ud-test-morph_analysis.conllu +predicted_train = edt_2.6/evaluation/morph_analysis/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_train.conllu +predicted_test = edt_2.6/evaluation/morph_analysis/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_test.conllu + diff --git a/confs/conf_edt_v26_stanza_ME_ensemble_full.ini b/confs/conf_edt_v26_stanza_ME_ensemble_full.ini new file mode 100644 index 00000000..7e25e43f --- /dev/null +++ b/confs/conf_edt_v26_stanza_ME_ensemble_full.ini @@ -0,0 +1,59 @@ 
+[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# Make train_full (for evaluation) +[join_morph_extended_train_full] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended + +# we need crossvalidation data splits for ensemble models +[split_morph_extended_crossvalidation] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = crossvalidation +block_count = 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits + +# train 10 crossvalidation models +[train_stanza_morph_extended_crossvalidation] +input_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits +experiment_type = crossvalidation +predict_after = True +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended/crossvalidation + +# predict using 10 models as an ensemble +[predict_stanza_ensemble_tagger_morph_extended_default] +experiment_type = full_data +train_file = edt_2.6/preannotated/morph_extended/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +models_dir = edt_2.6/trained/morph_extended/crossvalidation +#tagger_path = ensemble_tagger.StanzaSyntaxEnsembleTagger +tagger_path = ensemble_tagger_with_entropy.StanzaSyntaxEnsembleTaggerWithEntropy +use_estnltk = True +use_ensemble = True +seed = 43 +scores_seed = 3 +use_gpu = True +morph_layer = morph_extended +output_file_prefix = predicted_stanzaensembletagger_default_ +output_dir = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data + +[eval_stanza_ensemble_tagger_morph_extended_default_full_data] +experiment_type = full_data 
+gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_default_test.conllu + diff --git a/confs/conf_edt_v26_stanza_ME_ensemble_majority_voting_full.ini b/confs/conf_edt_v26_stanza_ME_ensemble_majority_voting_full.ini new file mode 100644 index 00000000..43dbf077 --- /dev/null +++ b/confs/conf_edt_v26_stanza_ME_ensemble_majority_voting_full.ini @@ -0,0 +1,60 @@ +[preannotation_morph_extended] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = morph_extended +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/morph_extended + +# Make train_full (for evaluation) +[join_morph_extended_train_full] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +output_dir = edt_2.6/preannotated/morph_extended + +# we need crossvalidation data splits for ensemble models +[split_morph_extended_crossvalidation] +input_dir = edt_2.6/preannotated/morph_extended +concatenate = train, dev +split_type = crossvalidation +block_count = 195 +split_count = 10 +seed = 9 +first_output_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits/initial_split +final_output_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits + +# train 10 crossvalidation models +[train_stanza_morph_extended_crossvalidation] +input_dir = edt_2.6/preannotated/morph_extended/crossvalidation_splits +experiment_type = crossvalidation +predict_after = True +args = --batch_size 5000 +output_dir = edt_2.6/trained/morph_extended/crossvalidation + +# predict using 10 models as an ensemble +[predict_stanza_ensemble_tagger_morph_extended_majority_voting] +experiment_type = 
full_data +train_file = edt_2.6/preannotated/morph_extended/train_full.conllu +test_file = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +models_dir = edt_2.6/trained/morph_extended/crossvalidation +#tagger_path = ensemble_tagger.StanzaSyntaxEnsembleTagger +tagger_path = ensemble_tagger_with_entropy.StanzaSyntaxEnsembleTaggerWithEntropy +use_estnltk = True +use_ensemble = True +use_majority_voting = True +seed = 43 +scores_seed = 3 +use_gpu = True +morph_layer = morph_extended +output_file_prefix = predicted_stanzaensembletagger_majority_voting_ +output_dir = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data + +[eval_stanza_morph_extended_full_data_ensemble_majority_voting] +experiment_type = full_data +gold_train = edt_2.6/preannotated/morph_extended/train_full.conllu +gold_test = edt_2.6/preannotated/morph_extended/et_edt-ud-test-morph_extended.conllu +predicted_train = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_train.conllu +predicted_test = edt_2.6/evaluation/morph_extended/stanza_ensemble_full_data/predicted_stanzaensembletagger_majority_voting_test.conllu + diff --git a/confs/conf_edt_v26_stanza_auto_ud_morph.ini b/confs/conf_edt_v26_stanza_auto_ud_morph.ini new file mode 100644 index 00000000..5bb5de36 --- /dev/null +++ b/confs/conf_edt_v26_stanza_auto_ud_morph.ini @@ -0,0 +1,43 @@ +[preannotation_ud_morph_analysis] +input_dir = UD_Estonian-EDT-r2.6 +morph_layer = ud_morph_analysis +seed = 43 +dictionarize = True +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/ud_auto_morph + +[join_ud_auto_morph_train_full] +input_dir = edt_2.6/preannotated/ud_auto_morph +concatenate = train, dev +output_dir = edt_2.6/preannotated/ud_auto_morph + +[train_stanza_ud_auto_morph] +experiment_type = full_data +train_file = edt_2.6/preannotated/ud_auto_morph/et_edt-ud-train-ud_morph_analysis.conllu +eval_file = 
edt_2.6/preannotated/ud_auto_morph/et_edt-ud-dev-ud_morph_analysis.conllu +args = --batch_size 5000 +output_dir = edt_2.6/trained/ud_auto_morph/stanza_full_data +model_file = model_ud_morph_analysis.pt +predict_after = True +#dry_run = True + +[predict_stanza_ud_auto_morph_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/ud_auto_morph/train_full.conllu +test_file = edt_2.6/preannotated/ud_auto_morph/et_edt-ud-test-ud_morph_analysis.conllu +model_file = edt_2.6/trained/ud_auto_morph/stanza_full_data/model_ud_morph_analysis.pt +output_dir = edt_2.6/evaluation/ud_auto_morph/stanza_full_data +#dry_run = True + +[eval_stanza_ud_auto_morph_full_data] +experiment_type = full_data +gold_train = edt_2.6/preannotated/ud_auto_morph/train_full.conllu +gold_test = edt_2.6/preannotated/ud_auto_morph/et_edt-ud-test-ud_morph_analysis.conllu +predicted_train = edt_2.6/evaluation/ud_auto_morph/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/ud_auto_morph/stanza_full_data/predicted_test.conllu + + + + diff --git a/confs/conf_edt_v26_stanza_gold_ud_morph.ini b/confs/conf_edt_v26_stanza_gold_ud_morph.ini new file mode 100644 index 00000000..395b8644 --- /dev/null +++ b/confs/conf_edt_v26_stanza_gold_ud_morph.ini @@ -0,0 +1,41 @@ +# train stanza on gold standard ud morph annotation +[copy_ud_morph] +input_dir = UD_Estonian-EDT-r2.6 +remove_empty_nodes = True +remove_deps = True +remove_misc = True +output_dir = edt_2.6/preannotated/ud_gold_morph + +[join_ud_morph_train_full] +input_dir = edt_2.6/preannotated/ud_gold_morph +concatenate = train, dev +output_dir = edt_2.6/preannotated/ud_gold_morph + +[train_stanza_ud_gold_morph] +experiment_type = full_data +train_file = edt_2.6/preannotated/ud_gold_morph/et_edt-ud-train.conllu +eval_file = edt_2.6/preannotated/ud_gold_morph/et_edt-ud-dev.conllu +args = --batch_size 5000 +output_dir = edt_2.6/trained/ud_gold_morph/stanza_full_data +model_file = model_ud.pt +predict_after = True +#dry_run = 
True + +[predict_stanza_ud_gold_morph_full] +experiment_type = full_data +train_file = edt_2.6/preannotated/ud_gold_morph/train_full.conllu +test_file = edt_2.6/preannotated/ud_gold_morph/et_edt-ud-test.conllu +model_file = edt_2.6/trained/ud_gold_morph/stanza_full_data/model_ud.pt +output_dir = edt_2.6/evaluation/ud_gold_morph/stanza_full_data +#dry_run = True + +[eval_stanza_ud_gold_morph_full_data] +experiment_type = full_data +gold_train = edt_2.6/preannotated/ud_gold_morph/train_full.conllu +gold_test = edt_2.6/preannotated/ud_gold_morph/et_edt-ud-test.conllu +predicted_train = edt_2.6/evaluation/ud_gold_morph/stanza_full_data/predicted_train.conllu +predicted_test = edt_2.6/evaluation/ud_gold_morph/stanza_full_data/predicted_test.conllu + + + + diff --git a/edt_2.11/results_stanza_basic.csv b/edt_2.11/results_stanza_basic.csv new file mode 100644 index 00000000..5f9ec231 --- /dev/null +++ b/edt_2.11/results_stanza_basic.csv @@ -0,0 +1,2 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train +eval_stanza_morph_extended_full_data,0.8484,0.9247,0.0764,0.8775,0.9430 diff --git a/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv b/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv new file mode 100644 index 00000000..3c78dc12 --- /dev/null +++ b/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv @@ -0,0 +1,51 @@ +,sketch,support +0,[V]nsubj(L),1918 +1,[S],1891 +2,[V]nsubj(L)obl(L),979 +3,[V],883 +4,[S]cop(L)nsubj:cop(L),853 +5,[V]nsubj(L)obj(L),760 +6,[V]advmod(L)nsubj(L),663 +7,[V]obj(L),594 +8,[V]obl(L),557 +9,[V]nsubj(L)obj(L)obl(L),534 +10,[V]obj(L)obl(L),489 +11,[S]advmod(L)cop(L)nsubj:cop(L),485 +12,[V]nsubj(L)obl(P),484 +13,[V]nsubj(P),477 +14,[V]advmod(L)nsubj(L)obl(L),476 +15,[S]nmod(L),423 +16,[V]nsubj(P)obl(L),418 +17,[V]nsubj(L)obj(P),396 +18,[V]advmod(L)nsubj(L)obj(L),366 +19,[S]cop(L)nsubj:cop(P),365 +20,[V]advmod(L),357 +21,[V]nsubj(L)obl(L)obl(L),338 +22,[X],311 +23,[V]nsubj(L)xcomp(P),305 
+24,[V]obj(P),291 +25,[V]aux(L)nsubj(L),283 +26,[S]cop(L)nsubj:cop(L)obl(L),267 +27,[S]nummod(L),264 +28,[V]nsubj(L)xcomp(L),258 +29,[V]nsubj(L)obl(L)obl(P),248 +30,[V]advmod(L)obj(L),242 +31,[V]nsubj(P)obj(L),236 +32,[V]aux(L)nsubj(L)obj(L),234 +33,[V]xcomp(P),217 +34,[S]amod(L)cop(L)nsubj:cop(L),213 +35,[V]nsubj(P)obl(P),212 +36,[V]obl(P),207 +37,[V]aux(L)nsubj(L)obl(L),206 +38,[V]advmod(L)obl(L),203 +39,[S]amod(L),203 +40,[V]obj(P)obl(L),195 +41,[V]nsubj(L)obj(P)obl(L),189 +42,[S]cop(L)nmod(L)nsubj:cop(L),188 +43,[V]nsubj(L)obj(L)obl(P),187 +44,[V]advmod(L)nsubj(L)obj(L)obl(L),185 +45,[V]advmod(L)nsubj(L)obl(P),184 +46,[S]advmod(L),177 +47,[V]obj(L)obl(P),177 +48,[V]advmod(L)nsubj(P),173 +49,[V]aux(L),173 diff --git a/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches_5_groups.csv b/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches_5_groups.csv new file mode 100644 index 00000000..3845a1b2 --- /dev/null +++ b/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches_5_groups.csv @@ -0,0 +1,6 @@ +,grouped_sketches,support +0,[V]nsubj(L)obl(L);[V]nsubj(L)obl(L)obl(P);[S]advmod(L)cop(L)nsubj:cop(L);[S]cop(L)nsubj:cop(P);[V]aux(L)nsubj(L)obl(L);[V]obj(L)obl(L);[V]nsubj(L)xcomp(P);[V]obj(P)obl(L);[V]obj(P);[V]obj(L),4157 +1,[S]nmod(L);[V]advmod(L)nsubj(P);[V]aux(L)nsubj(L)obj(L);[S]amod(L);[V]advmod(L);[V];[V]nsubj(L)obj(P);[S];[V]nsubj(L)obj(P)obl(L);[V]nsubj(P)obl(L),5167 +2,[V]obj(L)obl(P);[V]nsubj(L)obj(L);[S]amod(L)cop(L)nsubj:cop(L);[V]advmod(L)obl(L);[S]cop(L)nmod(L)nsubj:cop(L);[V]advmod(L)nsubj(L)obj(L);[V]nsubj(L)obj(L)obl(P);[V]advmod(L)nsubj(L)obj(L)obl(L);[V]advmod(L)obj(L);[S]cop(L)nsubj:cop(L),3374 +3,[V]nsubj(L)obj(L)obl(L);[S]advmod(L);[S]cop(L)nsubj:cop(L)obl(L);[V]nsubj(L)obl(P);[V]nsubj(P)obl(P);[V]advmod(L)nsubj(L)obl(P);[V]nsubj(L);[V]advmod(L)nsubj(L);[V]nsubj(L)xcomp(L);[V]obl(P),4904 
+4,[V]nsubj(L)obl(L)obl(L);[V]advmod(L)nsubj(L)obl(L);[V]aux(L);[X];[V]xcomp(P);[V]aux(L)nsubj(L);[S]nummod(L);[V]nsubj(P);[V]nsubj(P)obj(L);[V]obl(L),3332 diff --git a/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches_5_groups_preparation_log.txt b/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches_5_groups_preparation_log.txt new file mode 100644 index 00000000..6c0f49d1 --- /dev/null +++ b/edt_2.6/preannotated/morph_extended_clauses/top_50_sketches_5_groups_preparation_log.txt @@ -0,0 +1,293 @@ +Performing make_sketches_table_morph_extended_top_50 ... +et_edt-ud-dev-morph_extended.conllu | #clauses: 5708 +et_edt-ud-test-morph_extended.conllu | #clauses: 6033 +et_edt-ud-train-morph_extended.conllu | #clauses: 43966 + +#clauses total: 55707 +Writing sketches freq table into edt_2.6/preannotated/morph_extended_clauses/top_50_sketches.csv. +Performing prepare_knockout_morph_extended_top_50_sketches_5randomgroups ... +et_edt-ud-dev-morph_extended.conllu | #clauses: 5708 +et_edt-ud-test-morph_extended.conllu | #clauses: 6033 +et_edt-ud-train-morph_extended.conllu | #clauses: 43966 +[V]nsubj(L) + extracted 193 clauses from test +[S] + extracted 247 clauses from test +[V]nsubj(L)obl(L) + extracted 88 clauses from test +[V] + extracted 102 clauses from test +[S]cop(L)nsubj:cop(L) + extracted 81 clauses from test +[V]nsubj(L)obj(L) + extracted 62 clauses from test +[V]advmod(L)nsubj(L) + extracted 65 clauses from test +[V]obj(L) + extracted 65 clauses from test +[V]obl(L) + extracted 51 clauses from test +[V]nsubj(L)obj(L)obl(L) + extracted 44 clauses from test + extracted 6 clauses from train +[V]obj(L)obl(L) + extracted 49 clauses from test + extracted 1 clauses from train +[S]advmod(L)cop(L)nsubj:cop(L) + extracted 33 clauses from test + extracted 17 clauses from train +[V]nsubj(L)obl(P) + extracted 35 clauses from test + extracted 15 clauses from train +[V]nsubj(P) + extracted 52 clauses from test +[V]advmod(L)nsubj(L)obl(L) + extracted 51 clauses 
from test +[S]nmod(L) + extracted 36 clauses from test + extracted 14 clauses from train +[V]nsubj(P)obl(L) + extracted 43 clauses from test + extracted 7 clauses from train +[V]nsubj(L)obj(P) + extracted 44 clauses from test + extracted 6 clauses from train +[V]advmod(L)nsubj(L)obj(L) + extracted 39 clauses from test + extracted 11 clauses from train +[S]cop(L)nsubj:cop(P) + extracted 42 clauses from test + extracted 8 clauses from train +[V]advmod(L) + extracted 39 clauses from test + extracted 11 clauses from train +[V]nsubj(L)obl(L)obl(L) + extracted 36 clauses from test + extracted 14 clauses from train +[X] + extracted 50 clauses from test +[V]nsubj(L)xcomp(P) + extracted 22 clauses from test + extracted 28 clauses from train +[V]obj(P) + extracted 28 clauses from test + extracted 22 clauses from train +[V]aux(L)nsubj(L) + extracted 25 clauses from test + extracted 25 clauses from train +[S]cop(L)nsubj:cop(L)obl(L) + extracted 39 clauses from test + extracted 11 clauses from train +[S]nummod(L) + extracted 24 clauses from test + extracted 26 clauses from train +[V]nsubj(L)xcomp(L) + extracted 32 clauses from test + extracted 18 clauses from train +[V]nsubj(L)obl(L)obl(P) + extracted 23 clauses from test + extracted 27 clauses from train +[V]advmod(L)obj(L) + extracted 24 clauses from test + extracted 26 clauses from train +[V]nsubj(P)obj(L) + extracted 28 clauses from test + extracted 22 clauses from train +[V]aux(L)nsubj(L)obj(L) + extracted 25 clauses from test + extracted 25 clauses from train +[V]xcomp(P) + extracted 17 clauses from test + extracted 33 clauses from train +[S]amod(L)cop(L)nsubj:cop(L) + extracted 29 clauses from test + extracted 21 clauses from train +[V]nsubj(P)obl(P) + extracted 12 clauses from test + extracted 38 clauses from train +[V]obl(P) + extracted 12 clauses from test + extracted 38 clauses from train +[V]aux(L)nsubj(L)obl(L) + extracted 18 clauses from test + extracted 32 clauses from train +[S]amod(L) + extracted 21 clauses 
from test + extracted 29 clauses from train +[V]advmod(L)obl(L) + extracted 15 clauses from test + extracted 35 clauses from train +[V]obj(P)obl(L) + extracted 22 clauses from test + extracted 28 clauses from train +[V]nsubj(L)obj(P)obl(L) + extracted 19 clauses from test + extracted 31 clauses from train +[S]cop(L)nmod(L)nsubj:cop(L) + extracted 21 clauses from test + extracted 29 clauses from train +[V]nsubj(L)obj(L)obl(P) + extracted 18 clauses from test + extracted 32 clauses from train +[V]advmod(L)nsubj(L)obj(L)obl(L) + extracted 19 clauses from test + extracted 31 clauses from train +[V]advmod(L)nsubj(L)obl(P) + extracted 18 clauses from test + extracted 32 clauses from train +[V]obj(L)obl(P) + extracted 16 clauses from test + extracted 34 clauses from train +[S]advmod(L) + extracted 23 clauses from test + extracted 27 clauses from train +[V]advmod(L)nsubj(P) + extracted 15 clauses from test + extracted 35 clauses from train +[V]aux(L) + extracted 19 clauses from test + extracted 31 clauses from train + Saved 3007 test clauses into edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial\test_50x50.conllu. + Saved 43087 train clauses into edt_2.6/preannotated/morph_extended_clauses/knockout_5groups_initial\train_pure.conllu. + +group: 0 +[V]nsubj(L)obl(L) +[V]obj(L) +[V]obj(L)obl(L) +[S]advmod(L)cop(L)nsubj:cop(L) +[S]cop(L)nsubj:cop(P) +[V]nsubj(L)xcomp(P) +[V]obj(P) +[V]nsubj(L)obl(L)obl(P) +[V]aux(L)nsubj(L)obl(L) +[V]obj(P)obl(L) + Saved 553 test clauses into test_group0.conllu. + Removed 403 clauses and saved remaining 5305 dev clauses into dev_group0.conllu. + Removed 3201 clauses and saved remaining 39886 train clauses into train_group0.conllu. +group: 1 +[S] +[V] +[S]nmod(L) +[V]nsubj(P)obl(L) +[V]nsubj(L)obj(P) +[V]advmod(L) +[V]aux(L)nsubj(L)obj(L) +[S]amod(L) +[V]nsubj(L)obj(P)obl(L) +[V]advmod(L)nsubj(P) + Saved 749 test clauses into test_group1.conllu. + Removed 599 clauses and saved remaining 5109 dev clauses into dev_group1.conllu. 
+ Removed 3819 clauses and saved remaining 39268 train clauses into train_group1.conllu. +group: 2 +[S]cop(L)nsubj:cop(L) +[V]nsubj(L)obj(L) +[V]advmod(L)nsubj(L)obj(L) +[V]advmod(L)obj(L) +[S]amod(L)cop(L)nsubj:cop(L) +[V]advmod(L)obl(L) +[S]cop(L)nmod(L)nsubj:cop(L) +[V]nsubj(L)obj(L)obl(P) +[V]advmod(L)nsubj(L)obj(L)obl(L) +[V]obj(L)obl(P) + Saved 543 test clauses into test_group2.conllu. + Removed 340 clauses and saved remaining 5368 dev clauses into dev_group2.conllu. + Removed 2491 clauses and saved remaining 40596 train clauses into train_group2.conllu. +group: 3 +[V]nsubj(L) +[V]advmod(L)nsubj(L) +[V]nsubj(L)obj(L)obl(L) +[V]nsubj(L)obl(P) +[S]cop(L)nsubj:cop(L)obl(L) +[V]nsubj(L)xcomp(L) +[V]nsubj(P)obl(P) +[V]obl(P) +[V]advmod(L)nsubj(L)obl(P) +[S]advmod(L) + Saved 658 test clauses into test_group3.conllu. + Removed 477 clauses and saved remaining 5231 dev clauses into dev_group3.conllu. + Removed 3769 clauses and saved remaining 39318 train clauses into train_group3.conllu. +group: 4 +[V]obl(L) +[V]nsubj(P) +[V]advmod(L)nsubj(L)obl(L) +[V]nsubj(L)obl(L)obl(L) +[X] +[V]aux(L)nsubj(L) +[S]nummod(L) +[V]nsubj(P)obj(L) +[V]xcomp(P) +[V]aux(L) + Saved 504 test clauses into test_group4.conllu. + Removed 358 clauses and saved remaining 5350 dev clauses into dev_group4.conllu. + Removed 2467 clauses and saved remaining 40620 train clauses into train_group4.conllu. + +group: 0 +[V]nsubj(L)obl(L) +[V]obj(L) +[V]obj(L)obl(L) +[S]advmod(L)cop(L)nsubj:cop(L) +[S]cop(L)nsubj:cop(P) +[V]nsubj(L)xcomp(P) +[V]obj(P) +[V]nsubj(L)obl(L)obl(P) +[V]aux(L)nsubj(L)obl(L) +[V]obj(P)obl(L) + Saved 553 test clauses into test_group0.conllu. + Removed 403 clauses randomly and saved remaining 5305 dev clauses into dev_group0.conllu. + Removed 3201 clauses randomly and saved remaining 39886 train clauses into train_group0.conllu. 
+group: 1 +[S] +[V] +[S]nmod(L) +[V]nsubj(P)obl(L) +[V]nsubj(L)obj(P) +[V]advmod(L) +[V]aux(L)nsubj(L)obj(L) +[S]amod(L) +[V]nsubj(L)obj(P)obl(L) +[V]advmod(L)nsubj(P) + Saved 749 test clauses into test_group1.conllu. + Removed 599 clauses randomly and saved remaining 5109 dev clauses into dev_group1.conllu. + Removed 3819 clauses randomly and saved remaining 39268 train clauses into train_group1.conllu. +group: 2 +[S]cop(L)nsubj:cop(L) +[V]nsubj(L)obj(L) +[V]advmod(L)nsubj(L)obj(L) +[V]advmod(L)obj(L) +[S]amod(L)cop(L)nsubj:cop(L) +[V]advmod(L)obl(L) +[S]cop(L)nmod(L)nsubj:cop(L) +[V]nsubj(L)obj(L)obl(P) +[V]advmod(L)nsubj(L)obj(L)obl(L) +[V]obj(L)obl(P) + Saved 543 test clauses into test_group2.conllu. + Removed 340 clauses randomly and saved remaining 5368 dev clauses into dev_group2.conllu. + Removed 2491 clauses randomly and saved remaining 40596 train clauses into train_group2.conllu. +group: 3 +[V]nsubj(L) +[V]advmod(L)nsubj(L) +[V]nsubj(L)obj(L)obl(L) +[V]nsubj(L)obl(P) +[S]cop(L)nsubj:cop(L)obl(L) +[V]nsubj(L)xcomp(L) +[V]nsubj(P)obl(P) +[V]obl(P) +[V]advmod(L)nsubj(L)obl(P) +[S]advmod(L) + Saved 658 test clauses into test_group3.conllu. + Removed 477 clauses randomly and saved remaining 5231 dev clauses into dev_group3.conllu. + Removed 3769 clauses randomly and saved remaining 39318 train clauses into train_group3.conllu. +group: 4 +[V]obl(L) +[V]nsubj(P) +[V]advmod(L)nsubj(L)obl(L) +[V]nsubj(L)obl(L)obl(L) +[X] +[V]aux(L)nsubj(L) +[S]nummod(L) +[V]nsubj(P)obj(L) +[V]xcomp(P) +[V]aux(L) + Saved 504 test clauses into test_group4.conllu. + Removed 358 clauses randomly and saved remaining 5350 dev clauses into dev_group4.conllu. + Removed 2467 clauses randomly and saved remaining 40620 train clauses into train_group4.conllu. 
\ No newline at end of file diff --git a/edt_2.6/results_crossvalidation.csv b/edt_2.6/results_crossvalidation.csv new file mode 100644 index 00000000..3c9db58f --- /dev/null +++ b/edt_2.6/results_crossvalidation.csv @@ -0,0 +1,13 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train +eval_stanza_morph_extended_crossvalidation_01,0.8485,0.9292,0.0807,0.8778,0.9476 +eval_stanza_morph_extended_crossvalidation_02,0.8480,0.9217,0.0737,0.8767,0.9412 +eval_stanza_morph_extended_crossvalidation_03,0.8460,0.9274,0.0813,0.8760,0.9463 +eval_stanza_morph_extended_crossvalidation_04,0.8468,0.9263,0.0795,0.8763,0.9452 +eval_stanza_morph_extended_crossvalidation_05,0.8514,0.9308,0.0793,0.8797,0.9484 +eval_stanza_morph_extended_crossvalidation_06,0.8473,0.9203,0.0730,0.8772,0.9405 +eval_stanza_morph_extended_crossvalidation_07,0.8465,0.9201,0.0735,0.8759,0.9402 +eval_stanza_morph_extended_crossvalidation_08,0.8482,0.9297,0.0815,0.8779,0.9479 +eval_stanza_morph_extended_crossvalidation_09,0.8482,0.9241,0.0758,0.8782,0.9430 +eval_stanza_morph_extended_crossvalidation_10,0.8503,0.9322,0.0819,0.8800,0.9499 +eval_stanza_morph_extended_crossvalidation_AVG,0.8481,0.9262,0.0780,0.8776,0.9450 +eval_stanza_ensemble_tagger_morph_extended_default_full_data,0.8568,0.9337,0.0769,0.8851,0.9515 diff --git a/edt_2.6/results_ensemble_conf_intervals.csv b/edt_2.6/results_ensemble_conf_intervals.csv new file mode 100644 index 00000000..5272bb8e --- /dev/null +++ b/edt_2.6/results_ensemble_conf_intervals.csv @@ -0,0 +1,6 @@ +experiment,LAS_test,LAS_train,UAS_test,UAS_train,LAS_gap +eval_stanza_morph_extended_full_data_conf_intervals,0.8401; 0.8486; 0.8572,0.9161; 0.9176; 0.9191,0.8697; 0.8782; 0.8867,0.9363; 0.9378; 0.9393,0.0760; 0.0689; 0.0619 +eval_stanza_ensemble_morph_extended_full_data_las_coherence_conf_intervals,0.8488; 0.8568; 0.8649,0.9327; 0.9337; 0.9347,0.8774; 0.8852; 0.8929,0.9505; 0.9515; 0.9525,0.0839; 0.0769; 0.0698 
+eval_stanza_ensemble_morph_extended_full_data_majority_voting_conf_intervals,0.8487; 0.8564; 0.8641,0.9327; 0.9337; 0.9347,0.8775; 0.8849; 0.8924,0.9507; 0.9517; 0.9527,0.0840; 0.0773; 0.0707 +eval_stanza_ensemble_morph_extended_half_data_las_coherence_conf_intervals,0.8383; 0.8447; 0.8510,0.9134; 0.9152; 0.9169,0.8699; 0.8761; 0.8822,0.9325; 0.9344; 0.9363,0.0751; 0.0705; 0.0659 +eval_stanza_ensemble_morph_extended_half_data_majority_voting_conf_intervals,0.8386; 0.8449; 0.8512,0.9109; 0.9126; 0.9143,0.8699; 0.8762; 0.8826,0.9306; 0.9325; 0.9344,0.0722; 0.0677; 0.0631 diff --git a/edt_2.6/results_ensemble_entropy.csv b/edt_2.6/results_ensemble_entropy.csv new file mode 100644 index 00000000..bdb16004 --- /dev/null +++ b/edt_2.6/results_ensemble_entropy.csv @@ -0,0 +1,5 @@ +experiment,LAS_test,LAS_train,UAS_test,UAS_train,LAS_gap,test_LAS_0_entropy,test_LAS_0_entropy_matches,test_LAS_0_entropy_total_words,test_LAS_0_entropy_total_words_%,test_LAS_vs_entropy_corr,test_LAS_vs_entropy_corr_pvalue,train_LAS_0_entropy,train_LAS_0_entropy_matches,train_LAS_0_entropy_total_words,train_LAS_vs_entropy_corr,train_LAS_vs_entropy_corr_pvalue,train_LAS_0_entropy_total_words_%,test_words,train_words +eval_stanza_ensemble_morph_extended_full_data_las_coherence_entropy,0.8568,0.9337,0.8851,0.9515,0.0769,0.9256,36904,39870,82.2214,0.4431,0.0000,0.9699,322359,332368,0.3948,0.0000,85.3806,48491,389278 +eval_stanza_ensemble_morph_extended_full_data_majority_voting_entropy,0.8564,0.9337,0.8849,0.9517,0.0773,0.9256,36904,39870,82.2214,0.4465,0.0000,0.9699,322359,332368,0.3963,0.0000,85.3806,48491,389278 +eval_stanza_ensemble_morph_extended_half_data_las_coherence_entropy,0.8446,0.9151,0.8760,0.9344,0.0705,0.9405,33555,35679,73.5786,0.4760,0.0000,0.9831,286232,291164,0.4616,0.0000,74.7959,48491,389278 
+eval_stanza_ensemble_morph_extended_half_data_majority_voting_entropy,0.8449,0.9126,0.8762,0.9325,0.0677,0.9405,33555,35679,73.5786,0.4777,0.0000,0.9831,286232,291164,0.4743,0.0000,74.7959,48491,389278 diff --git a/edt_2.6/results_ensemble_majority_voting.csv b/edt_2.6/results_ensemble_majority_voting.csv new file mode 100644 index 00000000..534458fb --- /dev/null +++ b/edt_2.6/results_ensemble_majority_voting.csv @@ -0,0 +1,3 @@ +experiment,LAS_test,LAS_train,UAS_test,UAS_train,LAS_gap +eval_stanza_morph_extended_full_data_ensemble_majority_voting,0.8564,0.9337,0.8849,0.9517,0.0773 +eval_stanza_morph_extended_half_data_ensemble_majority_voting,0.8449,0.9126,0.8762,0.9325,0.0677 diff --git a/edt_2.6/results_full_data_malt_udpipe1.csv b/edt_2.6/results_full_data_malt_udpipe1.csv new file mode 100644 index 00000000..736bb26f --- /dev/null +++ b/edt_2.6/results_full_data_malt_udpipe1.csv @@ -0,0 +1,4 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train +eval_malt_morph_extended_full,0.7186,0.8446,0.1260,0.7681,0.8715 +eval_udpipe1_default_morph_extended,0.7560,0.8501,0.0941,0.8009,0.8811 +eval_udpipe1_embeddings_morph_extended,0.7713,0.9147,0.1434,0.8135,0.9330 diff --git a/edt_2.6/results_gap_experiments.csv b/edt_2.6/results_gap_experiments.csv new file mode 100644 index 00000000..6fa08606 --- /dev/null +++ b/edt_2.6/results_gap_experiments.csv @@ -0,0 +1,11 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train +eval_stanza_morph_extended_gap_experiments_01_no_wordforms,0.8396,0.9236,0.0840,0.8734,0.9461 +eval_stanza_morph_extended_gap_experiments_02_no_lemmas,0.8483,0.9308,0.0825,0.8789,0.9496 +eval_stanza_morph_extended_gap_experiments_02_no_pos,0.8492,0.9284,0.0792,0.8781,0.9471 +eval_stanza_morph_extended_gap_experiments_03_no_wordforms_adj_noun_lemmas,0.8200,0.9109,0.0909,0.8610,0.9396 +eval_stanza_morph_extended_gap_experiments_04_no_wordforms_verb_adpos_lemmas,0.7822,0.9106,0.1285,0.8181,0.9370 
+eval_stanza_morph_extended_gap_experiments_05_only_cg_list_wordforms_lemmas,0.8158,0.9153,0.0995,0.8572,0.9433 +eval_stanza_morph_extended_gap_experiments_06_no_wordform_lemma_pos_keep_conj,0.6869,0.8100,0.1230,0.7572,0.8701 +eval_stanza_morph_extended_gap_experiments_07_no_wordform_lemma_pos,0.6723,0.7648,0.0925,0.7532,0.8351 +eval_stanza_morph_extended_gap_experiments_08_only_wordforms,0.8407,0.9202,0.0795,0.8742,0.9391 +eval_stanza_morph_extended_gap_experiments_09_only_pos_feats,0.7018,0.8213,0.1195,0.7709,0.8799 diff --git a/edt_2.6/results_gold_and_auto_ud_morph.csv b/edt_2.6/results_gold_and_auto_ud_morph.csv new file mode 100644 index 00000000..97754cbe --- /dev/null +++ b/edt_2.6/results_gold_and_auto_ud_morph.csv @@ -0,0 +1,3 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train +eval_stanza_ud_auto_morph_full_data,0.8523,0.9333,0.0810,0.8808,0.9508 +eval_stanza_ud_gold_morph_full_data,0.8810,0.9380,0.0570,0.8987,0.9502 diff --git a/edt_2.6/results_half_data.csv b/edt_2.6/results_half_data.csv new file mode 100644 index 00000000..fb467fc4 --- /dev/null +++ b/edt_2.6/results_half_data.csv @@ -0,0 +1,13 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train +eval_stanza_morph_extended_half_data_001,0.8295,0.9448,0.1153,0.8625,0.9564 +eval_stanza_morph_extended_half_data_002,0.8278,0.9203,0.0925,0.8607,0.9392 +eval_stanza_morph_extended_half_data_003,0.8276,0.9453,0.1177,0.8602,0.9568 +eval_stanza_morph_extended_half_data_004,0.8265,0.9384,0.1119,0.8580,0.9521 +eval_stanza_morph_extended_half_data_005,0.8264,0.9289,0.1024,0.8601,0.9443 +eval_stanza_morph_extended_half_data_006,0.8248,0.9255,0.1007,0.8586,0.9425 +eval_stanza_morph_extended_half_data_007,0.8275,0.9271,0.0996,0.8602,0.9438 +eval_stanza_morph_extended_half_data_008,0.8287,0.9353,0.1065,0.8617,0.9495 +eval_stanza_morph_extended_half_data_009,0.8304,0.9368,0.1065,0.8627,0.9504 +eval_stanza_morph_extended_half_data_010,0.8252,0.9275,0.1023,0.8579,0.9445 
+eval_stanza_morph_extended_half_data_AVG,0.8274,0.9330,0.1055,0.8603,0.9480 +eval_stanza_morph_extended_half_data_ensemble,0.8446,0.9151,0.0705,0.8760,0.9344 diff --git a/edt_2.6/results_smaller_data.csv b/edt_2.6/results_smaller_data.csv new file mode 100644 index 00000000..f065157e --- /dev/null +++ b/edt_2.6/results_smaller_data.csv @@ -0,0 +1,31 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train,test_words,train_words +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_010,0.6864,0.9491,0.2628,0.7605,0.9591,48491,39886 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_020,0.7490,0.9636,0.2146,0.8004,0.9700,48491,77759 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_030,0.7819,0.9365,0.1546,0.8271,0.9489,48491,117642 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_040,0.7989,0.9366,0.1377,0.8388,0.9502,48491,157491 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_050,0.8152,0.9234,0.1082,0.8543,0.9406,48491,195477 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_060,0.8250,0.9299,0.1049,0.8595,0.9458,48491,235468 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_070,0.8241,0.9179,0.0938,0.8600,0.9370,48491,273422 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_080,0.8327,0.9194,0.0867,0.8674,0.9383,48491,311383 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_090,0.8343,0.9151,0.0808,0.8680,0.9362,48491,349322 +eval_stanza_morph_extended_smaller_data_01_08_only_wordforms_100,0.8377,0.9104,0.0728,0.8723,0.9320,48491,389278 +eval_stanza_morph_extended_smaller_data_02_keep_all_010,0.7509,0.9286,0.1778,0.7995,0.9402,48491,39886 +eval_stanza_morph_extended_smaller_data_02_keep_all_020,0.7895,0.9552,0.1657,0.8297,0.9630,48491,77759 +eval_stanza_morph_extended_smaller_data_02_keep_all_030,0.8062,0.9268,0.1207,0.8439,0.9440,48491,117642 +eval_stanza_morph_extended_smaller_data_02_keep_all_040,0.8209,0.9414,0.1205,0.8545,0.9542,48491,157491 
+eval_stanza_morph_extended_smaller_data_02_keep_all_050,0.8299,0.9360,0.1060,0.8616,0.9505,48491,195477 +eval_stanza_morph_extended_smaller_data_02_keep_all_060,0.8409,0.9350,0.0941,0.8723,0.9505,48491,235468 +eval_stanza_morph_extended_smaller_data_02_keep_all_070,0.8363,0.9248,0.0884,0.8675,0.9428,48491,273422 +eval_stanza_morph_extended_smaller_data_02_keep_all_080,0.8435,0.9295,0.0860,0.8739,0.9463,48491,311383 +eval_stanza_morph_extended_smaller_data_02_keep_all_090,0.8463,0.9261,0.0798,0.8756,0.9437,48491,349322 +eval_stanza_morph_extended_smaller_data_02_keep_all_100,0.8466,0.9236,0.0771,0.8763,0.9422,48491,389278 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_010,0.6285,0.8174,0.1889,0.7121,0.8568,48491,39886 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_020,0.6544,0.8542,0.1997,0.7336,0.8912,48491,77759 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_030,0.6652,0.7847,0.1195,0.7412,0.8445,48491,117642 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_040,0.6730,0.7889,0.1159,0.7479,0.8471,48491,157491 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_050,0.6802,0.7878,0.1076,0.7536,0.8479,48491,195477 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_060,0.6880,0.8124,0.1244,0.7595,0.8668,48491,235468 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_070,0.6915,0.8000,0.1085,0.7638,0.8594,48491,273422 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_080,0.6939,0.8001,0.1061,0.7653,0.8575,48491,311383 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_090,0.6952,0.8081,0.1128,0.7658,0.8664,48491,349322 +eval_stanza_morph_extended_smaller_data_03_only_pos_feats_09_only_pos_feats_100,0.6970,0.7874,0.0904,0.7670,0.8489,48491,389278 diff --git a/edt_2.6/results_stanza_MA_ensembles.csv 
b/edt_2.6/results_stanza_MA_ensembles.csv new file mode 100644 index 00000000..c0502833 --- /dev/null +++ b/edt_2.6/results_stanza_MA_ensembles.csv @@ -0,0 +1,3 @@ +experiment,LAS_test,LAS_train,UAS_test,UAS_train,LAS_gap +eval_stanza_ensemble_morph_analysis_full_data_default,0.8573,0.9330,0.8851,0.9510,0.0757 +eval_stanza_ensemble_morph_analysis_full_data_majority_voting,0.8573,0.9329,0.8858,0.9510,0.0756 diff --git a/edt_2.6/results_stanza_ME_conf_intervals.csv b/edt_2.6/results_stanza_ME_conf_intervals.csv new file mode 100644 index 00000000..686185a6 --- /dev/null +++ b/edt_2.6/results_stanza_ME_conf_intervals.csv @@ -0,0 +1,2 @@ +experiment,LAS_test,LAS_train,UAS_test,UAS_train,LAS_gap,test_words,train_words +eval_stanza_morph_extended_full_data_conf_intervals,0.8401; 0.8486; 0.8572,0.9161; 0.9176; 0.9191,0.8697; 0.8782; 0.8867,0.9363; 0.9378; 0.9393,0.0760; 0.0689; 0.0619,48491,389278 diff --git a/edt_2.6/results_stanza_ME_error_types.csv b/edt_2.6/results_stanza_ME_error_types.csv new file mode 100644 index 00000000..afab0e39 --- /dev/null +++ b/edt_2.6/results_stanza_ME_error_types.csv @@ -0,0 +1,2 @@ +experiment,E1,E2,E3,E1_impact,E2_impact,E3_impact,E1_rel_error,E2_rel_error,E3_rel_error,total_no_punct,correct,gold_in_clause,gold_out_of_clause,total_words,punct,unequal_length,E1_missed_root,E2_missed_root,E3_missed_root,E2_with_E3,E2_without_E3 +stanza_ME_error_types_on_test,4821,398,1162,0.7555,0.0624,0.1821,0.1413,0.0117,0.1733,40815,34434,34108,6707,48491,7676,0,0,0,254,360,38 diff --git a/edt_2.6/results_stanza_ME_full_predict_on_clauses.csv b/edt_2.6/results_stanza_ME_full_predict_on_clauses.csv new file mode 100644 index 00000000..4459222e --- /dev/null +++ b/edt_2.6/results_stanza_ME_full_predict_on_clauses.csv @@ -0,0 +1,2 @@ +experiment,LAS_test,LAS_train,UAS_test,UAS_train,LAS_gap +eval_stanza_morph_extended_full_predict_on_clauses,0.8544,0.9076,0.8873,0.9341,0.0532 diff --git a/edt_2.6/results_stanza_ME_full_predict_on_clauses_error_types.csv 
b/edt_2.6/results_stanza_ME_full_predict_on_clauses_error_types.csv new file mode 100644 index 00000000..b8a28663 --- /dev/null +++ b/edt_2.6/results_stanza_ME_full_predict_on_clauses_error_types.csv @@ -0,0 +1,2 @@ +experiment,E1,E2,E3,E1_impact,E2_impact,E3_impact,E1_rel_error,E2_rel_error,E3_rel_error,total_no_punct,correct,gold_in_clause,gold_out_of_clause,total_words,punct,unequal_length,E1_missed_root,E2_missed_root,E3_missed_root +stanza_ME_full_on_clauses_error_types_on_test,4774,352,355,0.8710,0.0642,0.0648,0.1490,0.0110,0.0585,38101,32620,32034,6067,39128,1027,0,0,0,352 diff --git a/edt_2.6/results_stanza_ME_on_clauses.csv b/edt_2.6/results_stanza_ME_on_clauses.csv new file mode 100644 index 00000000..d756dba3 --- /dev/null +++ b/edt_2.6/results_stanza_ME_on_clauses.csv @@ -0,0 +1,2 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train +eval_stanza_morph_extended_on_clauses,0.8623,0.9298,0.0674,0.8924,0.9492 diff --git a/edt_2.6/results_stanza_ME_on_clauses_error_types.csv b/edt_2.6/results_stanza_ME_on_clauses_error_types.csv new file mode 100644 index 00000000..a93d2914 --- /dev/null +++ b/edt_2.6/results_stanza_ME_on_clauses_error_types.csv @@ -0,0 +1,2 @@ +experiment,E1,E2,E3,E1_impact,E2_impact,E3_impact,E1_rel_error,E2_rel_error,E3_rel_error,total_no_punct,correct,gold_in_clause,gold_out_of_clause,total_words,punct,unequal_length,E1_missed_root,E2_missed_root,E3_missed_root +stanza_ME_on_clauses_error_types_on_test,4501,327,337,0.8714,0.0633,0.0652,0.1405,0.0102,0.0555,38101,32936,32034,6067,39128,1027,0,0,0,328 diff --git a/edt_2.6/results_stanza_ME_sketches_5groups_knockout.csv b/edt_2.6/results_stanza_ME_sketches_5groups_knockout.csv new file mode 100644 index 00000000..829f9faa --- /dev/null +++ b/edt_2.6/results_stanza_ME_sketches_5groups_knockout.csv @@ -0,0 +1,6 @@ +experiment,LAS_test,UAS_test +eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group0,0.9142,0.9392 
+eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group1,0.9126,0.9356 +eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group2,0.9161,0.9403 +eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group3,0.9108,0.9373 +eval_stanza_morph_extended_sketches_knockout_5groups_single_file_group4,0.9129,0.9380 diff --git a/edt_2.6/results_stanza_ME_sketches_5groups_knockout_matrix.csv b/edt_2.6/results_stanza_ME_sketches_5groups_knockout_matrix.csv new file mode 100644 index 00000000..bb43559e --- /dev/null +++ b/edt_2.6/results_stanza_ME_sketches_5groups_knockout_matrix.csv @@ -0,0 +1,6 @@ +,group0,group1,group2,group3,group4 +group0,0.8931,0.9046,0.9107,0.9139,0.9093 +group1,0.9193,0.9047,0.9189,0.9154,0.9154 +group2,0.9293,0.9250,0.9131,0.9186,0.9246 +group3,0.9136,0.9122,0.9186,0.8903,0.9168 +group4,0.9194,0.9179,0.9204,0.9204,0.8947 diff --git a/edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout.csv b/edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout.csv new file mode 100644 index 00000000..c90ae5cc --- /dev/null +++ b/edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout.csv @@ -0,0 +1,6 @@ +experiment,LAS_test,UAS_test +eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group0,0.9169,0.9393 +eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group1,0.9154,0.9372 +eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group2,0.9169,0.9399 +eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group3,0.9148,0.9388 +eval_stanza_morph_extended_sketches_knockout_5randomgroups_single_file_group4,0.9165,0.9396 diff --git a/edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout_matrix.csv b/edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout_matrix.csv new file mode 100644 index 00000000..fe7fcb8e --- /dev/null +++ b/edt_2.6/results_stanza_ME_sketches_5randomgroups_knockout_matrix.csv @@ -0,0 +1,6 @@ 
+,group0,group1,group2,group3,group4 +group0,0.9114,0.9057,0.9150,0.9114,0.9100 +group1,0.9129,0.9176,0.9154,0.9098,0.9154 +group2,0.9198,0.9218,0.9230,0.9186,0.9198 +group3,0.9179,0.9154,0.9136,0.9161,0.9182 +group4,0.9239,0.9184,0.9184,0.9184,0.9204 diff --git a/edt_2.6/results_stanza_UD_on_auto_UD_morph.csv b/edt_2.6/results_stanza_UD_on_auto_UD_morph.csv new file mode 100644 index 00000000..79fbc75b --- /dev/null +++ b/edt_2.6/results_stanza_UD_on_auto_UD_morph.csv @@ -0,0 +1,2 @@ +experiment,LAS_test,LAS_train,UAS_test,UAS_train,LAS_gap +eval_stanza_ud_on_stanza_auto_morph_full_data,0.8519,0.9291,0.8801,0.9467,0.0772 diff --git a/edt_2.6/results_stanza_basic.csv b/edt_2.6/results_stanza_basic.csv new file mode 100644 index 00000000..82908a21 --- /dev/null +++ b/edt_2.6/results_stanza_basic.csv @@ -0,0 +1,3 @@ +experiment,LAS_test,LAS_train,LAS_gap,UAS_test,UAS_train,test_words,train_words +eval_stanza_morph_analysis_full_data,0.8507,0.9231,0.0724,0.8802,0.9422,48491,389278 +eval_stanza_morph_extended_full_data,0.8486,0.9176,0.0689,0.8782,0.9378,48491,389278 diff --git a/ensemble_tagger_with_entropy.py b/ensemble_tagger_with_entropy.py new file mode 100644 index 00000000..fd3620e6 --- /dev/null +++ b/ensemble_tagger_with_entropy.py @@ -0,0 +1,450 @@ +# +# Tags dependency syntactic analysis with an ensemble of Stanza's models and records predictions' entropy. 
+# Requires estnltk 1.7.2 +# + +import os +from collections import defaultdict, Counter, OrderedDict +from decimal import Decimal, getcontext +from random import Random +from typing import Tuple +import warnings + +from scipy.stats import entropy + +from estnltk import Layer +from estnltk.taggers.standard.syntax.syntax_dependency_retagger import SyntaxDependencyRetagger +from estnltk.taggers.standard.syntax.ud_validation.deprel_agreement_retagger import DeprelAgreementRetagger +from estnltk.taggers.standard.syntax.ud_validation.ud_validation_retagger import UDValidationRetagger +from estnltk.taggers import Tagger +from estnltk.downloader import get_resource_paths + +from estnltk.converters.serialisation_modules import syntax_v0 + +from estnltk_neural.taggers.syntax.stanza_tagger.common_utils import prepare_input_doc +from estnltk_neural.taggers.syntax.stanza_tagger.common_utils import feats_to_ordereddict + +class StanzaSyntaxEnsembleTaggerWithEntropy(Tagger): + """ + Tags dependency syntactic analysis with an ensemble of Stanza's models and records predictions' entropy. + The tagger assumes that the segmentation to sentences and words is completed beforehand. When using default + models, the tagger assumes that extended morph analysis is completed with VabaMorf module. + + The tagger creates a syntax layer that features Universal Dependencies dependency-tags in attribute 'deprel'. + UPOS is the same as VabaMorf's part of speech tag and feats is based on VabaMorf's forms. + + Names of layers to use can be changed using parameters sentences_layer, words_layer and input_morph_layer, + if needed. To use GPU for parsing, parameter use_gpu must be set to True. + Parameter add_parent_and_children adds attributes that contain the parent and children of a word. + + When using models which are trained on some missing conllu fields (text, lemma, upos, xpos, feats), these + fields can be omitted by assigning a list of field names to parameter `remove_fields`.
Fields can also be + replaced with a chosen string by assigning a tuple containing a list of field names as the first element + and a string as the second to parameter `replace_fields`. + + The input morph analysis layer can be ambiguous. In that case, StanzaSyntaxEnsembleTagger randomly picks one + morph analysis for each ambiguous word, and predicts from "unambiguous" input. + Important: as a result, by default, the output will not be deterministic: for ambiguous words, you will + get different 'lemma', 'upostag', 'xpostag', 'feats' values on each run, and this also affects the results + of dependency parsing. + Ambiguity can also arise in dependency parsing: ensemble models can give multiple dependency parses with + maximum score and one parse needs to be chosen -- this is again done via a random choice (using a different + random generator). + How to make the output deterministic: you can pass a seed value for picking one analysis from ambiguous + morph analyses via constructor parameter random_pick_seed (int, default value: None), and a seed value + for choosing one dependency result from results with maximum scores via constructor parameter + random_pick_max_score_seed (int, default value: None). + Note that seed values are fixed once at creating a new instance of StanzaSyntaxEnsembleTagger, and you only + get deterministic / repeatable results if you tag texts in exactly the same order. + Note: if you want to get the same deterministic output as in previous versions of the tagger, use + random_pick_seed=5 and random_pick_max_score_seed=3. + + Aggregation algorithm. For aggregating predictions from multiple models, there are currently 2 algorithms + available. The first / default method ('las_coherence') processes input sentence-wise and calculates LAS + scores between each model's sentence prediction and all other sentence predictions. The sentence prediction + with the highest average LAS will be chosen for the output.
This method ensures valid output tree structure. + The second method ('majority_voting') processes input token-wise and records predicted head & deprel frequencies + for each token in a sentence. After that, it applies Chu–Liu/Edmonds' algorithm to construct a valid syntactic + tree of the sentence over high frequency heads of each token. Any remaining ambiguities (e.g. choices between + multiple different deprels for a head) will be resolved via random choice. + You can set the aggregation algorithm via constructor parameter aggregation_algorithm. + + Tutorial: + https://github.com/estnltk/estnltk/blob/main/tutorials/nlp_pipeline/C_syntax/03_syntactic_analysis_with_stanza.ipynb + """ + + conf_param = ['add_parent_and_children', 'aggregation_algorithm', 'syntax_dependency_retagger', + 'mark_syntax_error', 'mark_agreement_error', 'agreement_error_retagger', + 'ud_validation_retagger', 'use_gpu', 'gpu_max_words_in_sentence', 'model_paths', + 'taggers', 'remove_fields', 'replace_fields', 'find_entropy', 'random_pick_seed', + '_random1', 'random_pick_max_score_seed', '_random2'] + + def __init__(self, + output_layer: str = 'stanza_ensemble_syntax', + sentences_layer: str = 'sentences', + words_layer: str = 'words', + input_morph_layer: str = 'morph_extended', + aggregation_algorithm: str = 'las_coherence', + random_pick_seed: int = None, + random_pick_max_score_seed: int = None, + remove_fields: list = None, + replace_fields: Tuple[list, str] = None, + model_paths: list = None, + add_parent_and_children: bool = False, + mark_syntax_error: bool = False, + mark_agreement_error: bool = False, + find_entropy: bool = True, + use_gpu: bool = False, + gpu_max_words_in_sentence: int = 1000 + ): + # Make an internal import to avoid explicit stanza dependency + import stanza + + self.output_layer = output_layer + self.remove_fields = remove_fields + self.replace_fields = replace_fields + self.model_paths = model_paths + self.add_parent_and_children = add_parent_and_children + 
self.mark_syntax_error = mark_syntax_error + self.mark_agreement_error = mark_agreement_error + self.output_attributes = ('id', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc') + if not isinstance(aggregation_algorithm, str) or \ + aggregation_algorithm.lower() not in ['las_coherence', 'majority_voting']: + raise ValueError(('(!) Unexpected aggregation_algorithm value {!r}. '+\ + 'Must be a value from set {"las_coherence", "majority_voting"}').format(aggregation_algorithm)) + self.aggregation_algorithm = aggregation_algorithm.lower() + self.use_gpu = use_gpu + # We may run into "CUDA out of memory" error when processing very long sentences + # with GPU. + # Set a reasonable default for max sentence length: if that gets exceeded, then a + # guarding exception will be thrown + self.gpu_max_words_in_sentence = gpu_max_words_in_sentence + + # Random generator for picking one analysis from ambiguous morph analyses: + self.random_pick_seed = random_pick_seed + self._random1 = Random() + if isinstance(self.random_pick_seed, int): + self._random1.seed(self.random_pick_seed) + # Random generator for choosing one dependency result from results with maximum scores: + self.random_pick_max_score_seed = random_pick_max_score_seed + self._random2 = Random() + if isinstance(self.random_pick_max_score_seed, int): + self._random2.seed(self.random_pick_max_score_seed) + + # Try to get the resources path for stanzasyntaxensembletagger. Attempt to download resources, if missing + resources_path = get_resource_paths("stanzasyntaxensembletagger", only_latest=True, download_missing=True) + if resources_path is None: + raise Exception('Models of StanzaSyntaxEnsembleTagger are missing. 
'+\ + 'Please use estnltk.download("stanzasyntaxensembletagger") to download the models.') + + if not model_paths: + self.model_paths = list() + ensemble_path = os.path.join(resources_path, 'et', 'depparse', 'ensemble_models') + if not os.path.isdir(ensemble_path): + raise ValueError('Missing models under the subdirectory `stanza_resources/et/depparse/ensemble_models.') + for model in os.listdir(ensemble_path): + self.model_paths.append(os.path.join(ensemble_path, model)) + + self.taggers = dict() # Save taggers + for i, model_path in enumerate(self.model_paths): + if not os.path.isfile(model_path): + raise ValueError('Invalid model path: {}'.format(model_path)) + + nlp = stanza.Pipeline(lang='et', processors='depparse', + dir=resources_path, + depparse_pretagged=True, + depparse_model_path=model_path, + use_gpu=self.use_gpu, + logging_level='WARN') + self.taggers[str(i)] = nlp + + self.input_layers = [sentences_layer, input_morph_layer, words_layer] + + self.syntax_dependency_retagger = None + if add_parent_and_children: + self.syntax_dependency_retagger = SyntaxDependencyRetagger(syntax_layer=output_layer) + self.output_attributes += ('parent_span', 'children') + + self.ud_validation_retagger = None + if mark_syntax_error: + self.ud_validation_retagger = UDValidationRetagger(output_layer=output_layer) + self.output_attributes += ('syntax_error', 'error_message') + + self.agreement_error_retagger = None + if mark_agreement_error: + if not add_parent_and_children: + raise ValueError('`add_parent_and_children` must be True for marking agreement errors.') + else: + self.agreement_error_retagger = DeprelAgreementRetagger(output_layer=output_layer) + self.output_attributes += ('agreement_deprel',) + + self.find_entropy = find_entropy + if self.find_entropy: + self.output_attributes += ('entropy', 'votes') + + def _make_layer_template(self): + """Creates and returns a template of the layer.""" + layer = Layer(name=self.output_layer, + text_object=None, + 
attributes=self.output_attributes, + parent=self.input_layers[1], + ambiguous=False ) + if self.add_parent_and_children: + layer.serialisation_module = syntax_v0.__version__ + return layer + + def _make_layer(self, text, layers, status): + # Make an internal import to avoid explicit stanza dependency + import numpy as np + from stanza import Document + from stanza.models.common.chuliu_edmonds import chuliu_edmonds_one_root + + sentences_layer = self.input_layers[0] + morph_layer = self.input_layers[1] + + sents_lases_table = defaultdict(dict) + parsed_texts = defaultdict() + + text_data = prepare_input_doc(layers, sentences_layer, morph_layer, remove_fields=self.remove_fields, + replace_fields=self.replace_fields, random_picker=self._random1) + + if self.use_gpu and self.gpu_max_words_in_sentence is not None: + # Check that sentences are not too long (for CUDA memory) + for sentence in text_data: + if len(sentence) > self.gpu_max_words_in_sentence: + raise Exception( ('(!) Encountered a sentence which length ({}) exceeds '+\ + 'gpu_max_words_in_sentence ({}). Are you sure GPU '+\ + 'has enough memory for processing this long sentence? 
'+\ + 'Either process this document with CPU or, if GPU '+\ + 'memory is ensured, pass parameter '+\ + 'gpu_max_words_in_sentence=None to this tagger '+\ + 'to disable this exception.').format(len(sentence), \ + self.gpu_max_words_in_sentence) ) + + for model, nlp in self.taggers.items(): + doc = Document(text_data) + nlp(doc) # Parsing documents + parsed_texts[model] = doc.to_dict() + + parent_layer = layers[self.input_layers[1]] + # Find predictions' uncertainty + words_entropy = None + if self.find_entropy: + words_entropy = \ + find_prediction_entropy(parent_layer, parsed_texts, add_model_votes=True) + assert len(words_entropy) == len(parent_layer) + extracted_words = [] + if self.aggregation_algorithm == 'las_coherence': + # 1) Compare predictions of each model against every other + # model, and find the prediction with the highest avg sentence + # LAS score. This method ensures valid tree structure. + extracted_data = [] + for model, parsed in parsed_texts.items(): + for model2, parsed2 in parsed_texts.items(): + text_sentence_lases = text_sentence_LAS(parsed, parsed2) + sents_lases_table[model][model2] = text_sentence_lases + + final_table = defaultdict(dict) # Scores by sentence indices. 
+ for idx in range(len(layers[sentences_layer])): + for model, scores in sents_lases_table.items(): + if model not in final_table: + final_table[idx][model] = {} + for model2, sentence_scores in scores.items(): + score = sentence_scores[idx] + final_table[idx][model][model2] = score + + sent_scores = defaultdict(dict) + getcontext().prec = 4 + for sent, score_dict in final_table.items(): + if sent not in sent_scores: + sent_scores[sent] = Counter() + for base_model, score in score_dict.items(): + decimals = list(map(Decimal, score.values())) + avg_score = sum(decimals) / Decimal(len(self.taggers)) + sent_scores[sent][base_model] = avg_score + + chosen_sents = defaultdict(list) + for sent, score in sent_scores.items(): + max_score = max(score.values()) + max_score_count = 0 + max_score_models = [] + for s in score: + if score[s] == max_score: + max_score_count += 1 + max_score_models.append(s) + self._random2.shuffle(max_score_models) + chosen_sents[max_score_models[0]].append(sent) + + idxed_sents = {} + for model, sent_no in chosen_sents.items(): + content = parsed_texts[model] + sents_set = set(sent_no) + for idx in sents_set: + idxed_sents[idx] = content[idx] + + for idx in range(0, len(idxed_sents)): + extracted_data.append(idxed_sents[idx]) + + extracted_words = [word for sentence in extracted_data for word in sentence] + else: + assert self.aggregation_algorithm == "majority_voting" + # 2) Majority voting: pick the dependency relation with the + # highest number of votes over all model predictions. 
+ word_id = 0 + sentence_id = 0 + while sentence_id < len(layers[sentences_layer]): + # 1) Collect words and votes for the current sentence + sentence_word_id = 0 + sent_len = len(layers[sentences_layer][sentence_id]) + voting_table = defaultdict(lambda: defaultdict(int)) + label_token_map = defaultdict(lambda: defaultdict(list)) + sent_matrix = np.zeros((sent_len+1, sent_len+1)) + np.fill_diagonal(sent_matrix, -float('inf')) + while sentence_word_id < sent_len: + # collect votes + for model, parsed_doc in parsed_texts.items(): + assert len(parsed_doc) == len(layers[sentences_layer]) + sentence = parsed_doc[sentence_id] + assert len(sentence) == sent_len + token = sentence[sentence_word_id] + label = '{}__{}'.format(token['deprel'], token['head']) + voting_table[sentence_word_id][label] += 1 + label_token_map[sentence_word_id][label].append(token) + head_int = int(token['head']) + sent_matrix[sentence_word_id+1, head_int] += 1.0 + sentence_word_id += 1 + word_id += 1 + # 2) use Chu–Liu/Edmonds' algorithm to find head_seq of a valid tree + valid_tree_head_seq = chuliu_edmonds_one_root(sent_matrix)[1:] + # 3) For each word, find maximum voting score and corresponding tokens + for wid in sorted(voting_table.keys()): + valid_head = valid_tree_head_seq[wid] + max_votes_valid = [] + for l, v in voting_table[wid].items(): + if l.endswith('__{}'.format(valid_head)): + max_votes_valid.append((l, v)) + if not max_votes_valid: + # If something went wrong, then fall back to unchecked tree. + word_str = layers[sentences_layer][sentence_id][wid] + sentence_str = layers[sentences_layer][sentence_id].enclosing_text + msg = ('(!) Unable to find a tree-bound head for '+\ + 'word {!r} in sentence {!r}. ').format(word_str, sentence_str) + msg += 'Falling back to unchecked tree construction, ' + msg += 'which may result in an invalid syntax tree.' 
+ warnings.warn(msg) + max_votes_valid = voting_table[wid].items() + max_votes = max([v for (l, v) in max_votes_valid]) + max_votes_labels = [l for l, v in max_votes_valid if v==max_votes] + max_votes_tokens = [] + for label, tokens in label_token_map[wid].items(): + if label in max_votes_labels: + max_votes_tokens.extend(tokens) + # In case of a tie, pick a token randomly + self._random2.shuffle(max_votes_tokens) + extracted_words.append(max_votes_tokens[0]) + # Next sentence + sentence_id += 1 + assert len(extracted_words) == len(parent_layer) + + layer = self._make_layer_template() + layer.text_object=text + + global_word_id = 0 + for token, span in zip(extracted_words, parent_layer): + assert span.text == token['text'] + word_id = token['id'] + lemma = token['lemma'] + upostag = token['upos'] + xpostag = token['xpos'] + feats = OrderedDict() + if 'feats' in token.keys(): + feats = feats_to_ordereddict(token['feats']) + head = token['head'] + deprel = token['deprel'] + + attributes = {'id': word_id, 'lemma': lemma, 'upostag': upostag, 'xpostag': xpostag, + 'feats': feats, 'head': head, 'deprel': deprel, 'deps': '_', 'misc': '_'} + if words_entropy is not None: + w_entropy = words_entropy[global_word_id] + attributes['entropy'] = w_entropy['entropy'] + attributes['votes'] = w_entropy['votes'] + + layer.add_annotation(span, **attributes) + global_word_id += 1 + + if self.add_parent_and_children: + # Add 'parent_span' & 'children' to the syntax layer. + self.syntax_dependency_retagger.change_layer(text, {self.output_layer: layer}) + + if self.mark_syntax_error: + # Add 'syntax_error' & 'error_message' to the layer. + self.ud_validation_retagger.change_layer(text, {self.output_layer: layer}) + + if self.mark_agreement_error: + # Add 'agreement_deprel' to the layer. 
+ self.agreement_error_retagger.change_layer(text, {self.output_layer: layer}) + + return layer + + +def find_prediction_entropy(words_layer, parsed_texts, add_model_votes=True, add_sentence_ids=False): + ''' + Calculates uncertainty/Shannon entropy for ensemble predictions of each word. + If add_model_votes, then adds frequencies of model votes to results (for debugging). + If add_sentence_ids, then adds sentence id-s to results. + ''' + word_id = 0 + sentence_id = 0 + sentence_word_id = 0 + results = [] + while word_id < len( words_layer ): + sentence = None + # Get (deprel,head) votes for the word token + votes = [] + for model, parsed_doc in parsed_texts.items(): + sentence = parsed_doc[sentence_id] + token = sentence[sentence_word_id] + label = '{}_{}'.format(token['deprel'], token['head']) + votes.append(label) + voting_table = Counter(votes) + normalized_voting_table = \ + [voting_table[k]/len(parsed_texts.items()) for k in voting_table.keys()] + e = entropy(normalized_voting_table) + r = {'entropy': e} + if add_model_votes: + r['votes'] = voting_table.most_common() + if add_sentence_ids: + r['sentence_id'] = sentence_id + results.append(r) + word_id += 1 + sentence_word_id += 1 + if sentence_word_id >= len(sentence): + # Next sentence + sentence_id += 1 + sentence_word_id = 0 + return results + + +def sentence_LAS(sent1, sent2): + wrong = 0 + correct = 0 + for tok1, tok2 in zip(sent1, sent2): + if tok1['xpos'] != 'Z': + if tok1['head'] == tok2['head'] and tok1['deprel'] == tok2['deprel']: + correct += 1 + else: + wrong += 1 + + if wrong == 0 and correct == 0: + return 1 + else: + return correct / (correct + wrong) + + +def text_sentence_LAS(sents1, sents2): + file_sentence_lases = [] + for sent1, sent2 in zip(sents1, sents2): + las = sentence_LAS(sent1, sent2) + file_sentence_lases.append(las) + return file_sentence_lases + diff --git a/py39_stanza_training_env.yml b/py39_stanza_training_env.yml new file mode 100644 index 00000000..41045a5b --- /dev/null 
+++ b/py39_stanza_training_env.yml @@ -0,0 +1,123 @@ +name: py39_stanza_training +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - ca-certificates=2022.10.11=h06a4308_0 + - certifi=2022.12.7=py39h06a4308_0 + - ld_impl_linux-64=2.38=h1181459_1 + - libffi=3.4.2=h6a678d5_6 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - ncurses=6.3=h5eee18b_3 + - openssl=1.1.1s=h7f8727e_0 + - pip=22.3.1=py39h06a4308_0 + - python=3.9.15=h7a1cb2a_2 + - readline=8.2=h5eee18b_0 + - setuptools=65.6.3=py39h06a4308_0 + - sqlite=3.40.1=h5082296_0 + - tk=8.6.12=h1ccaba5_0 + - tzdata=2022g=h04d1e81_0 + - wheel=0.37.1=pyhd3eb1b0_0 + - xz=5.2.8=h5eee18b_0 + - zlib=1.2.13=h5eee18b_0 + - pip: + - absl-py==1.4.0 + - asttokens==2.2.1 + - astunparse==1.6.3 + - backcall==0.2.0 + - beautifulsoup4==4.11.1 + - bs4==0.0.1 + - cached-property==1.5.2 + - cachetools==5.2.1 + - charset-normalizer==2.1.1 + - click==8.1.3 + - conllu==4.5.2 + - contourpy==1.0.6 + - cycler==0.11.0 + - decorator==5.1.1 + - emoji==2.2.0 + - estnltk==1.7.2 + - estnltk-core==1.7.2 + - estnltk-neural==1.7.2 + - executing==1.2.0 + - filelock==3.9.0 + - flatbuffers==23.1.4 + - fonttools==4.38.0 + - gast==0.4.0 + - google-auth==2.16.0 + - google-auth-oauthlib==0.4.6 + - google-pasta==0.2.0 + - grpcio==1.51.1 + - h5py==3.7.0 + - html5lib==1.1 + - huggingface-hub==0.11.1 + - idna==3.4 + - importlib-metadata==6.0.0 + - ipython==8.8.0 + - jedi==0.18.2 + - joblib==1.2.0 + - keras==2.11.0 + - kiwisolver==1.4.4 + - libclang==15.0.6.1 + - lxml==4.9.2 + - markdown==3.4.1 + - markupsafe==2.1.2 + - matplotlib==3.6.2 + - matplotlib-inline==0.1.6 + - networkx==3.0 + - nltk==3.8.1 + - numpy==1.24.1 + - oauthlib==3.2.2 + - opt-einsum==3.3.0 + - packaging==23.0 + - pandas==1.5.2 + - parso==0.8.3 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - pillow==9.4.0 + - prompt-toolkit==3.0.36 + - protobuf==3.19.6 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - 
pyahocorasick==1.4.4 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pygments==2.14.0 + - pyparsing==3.0.9 + - python-crfsuite==0.9.8 + - python-dateutil==2.8.2 + - pytz==2022.7 + - pyyaml==6.0 + - regex==2022.10.31 + - requests==2.28.1 + - requests-oauthlib==1.3.1 + - rsa==4.9 + - six==1.16.0 + - soupsieve==2.3.2.post1 + - stack-data==0.6.2 + - stanza==1.4.2 + - tensorboard==2.11.2 + - tensorboard-data-server==0.6.1 + - tensorboard-plugin-wit==1.8.1 + - tensorflow==2.11.0 + - tensorflow-estimator==2.11.0 + - tensorflow-io-gcs-filesystem==0.29.0 + - termcolor==2.2.0 + - tokenizers==0.13.2 + - torch==1.13.1+cu116 + - torchaudio==0.13.1+cu116 + - torchvision==0.14.1+cu116 + - tqdm==4.64.1 + - traitlets==5.8.1 + - transformers==4.25.1 + - typing-extensions==4.4.0 + - urllib3==1.26.14 + - wcwidth==0.2.5 + - webencodings==0.5.1 + - werkzeug==2.2.2 + - wrapt==1.14.1 + - zipp==3.11.0 +prefix: /home/soras/.conda/envs/py39_stanza_training diff --git a/readme.md b/readme.md new file mode 100644 index 00000000..0347d82e --- /dev/null +++ b/readme.md @@ -0,0 +1,97 @@ +## Syntax ablation experiments + +This repository contains codebase and results of the experiments reported in the article ["Automatic dependency parsing of Estonian: what linguistic features to include?"](https://doi.org/10.1007/s10579-024-09779-z). 
+ +### Pre-requisites + +* Install [estnltk](https://github.com/estnltk/estnltk) (version 1.7.2+ is required); +* Install [stanza](https://github.com/stanfordnlp/stanza) (we used version 1.4.2); +* Install [scikit-learn](https://scikit-learn.org/) (we used version 1.2.1); +* For some of the experiments, you'll also need [MaltParser](https://maltparser.org), [MaltOptimizer](http://nil.fdi.ucm.es/maltoptimizer), [UDPipe-1](https://ufal.mff.cuni.cz/udpipe/1) and [gensim](https://radimrehurek.com/gensim); +* For visualization of the results, you'll need [matplotlib](https://matplotlib.org/stable/), [seaborn](https://seaborn.pydata.org/), [plotnine](https://plotnine.readthedocs.io/en/stable/), [patchworklib](https://pypi.org/project/patchworklib/), [adjustText](https://pypi.org/project/adjustText/); +* Download and unpack [Estonian UD corpus](https://github.com/UniversalDependencies/UD_Estonian-EDT/tags) (most experiments were conducted with the corpus version 2.6, the version 2.11 was used for one experiment); + +### Configuration files + +Most important settings of data pre-processing, training and evaluation are defined in configuration INI files. You can find these files in [confs](confs/) folder. In order to run a processing step, pass name of an INI file as an argument to the script. + +### Processing steps (scripts) + +* `01_ud_preprocessing.py` -- Converts gold standard UD corpus to EstNLTK's format: overwrites values of `lemma`, `upos`, `xpos` and `feats` with EstNLTK's automatic morphological analyses (from layers `morph_analysis` / `morph_extended` / `ud_morph_analysis`). Alternatively, you can also skip the conversion altogether and just clean the gold standard files and copy to the experiments folder. Executes all sections starting with `preannotation_` and `copy_` in input configuration file. 
Example usage: + + * `python 01_ud_preprocessing.py confs/conf_edt_v26_Stanza_ME_full.ini` + +* `01b_extract_clauses.py` -- Splits sentences in CONLLU files into clauses (with EstNLTK's ClauseTagger). Cleans clauses (removes conjunctions and punctuation at the beginning and/or at the end of a clause), and exports cleaned clauses as CONLLU files. Executes all sections starting with `extract_clauses_` in input configuration file. This is a preprocessing step required by _syntax sketches knockout experiments_. Example usage: + + * `python 01b_extract_clauses.py confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups.ini` + +* `01c_analyse_sketches.ipynb` -- computes sketches from the whole corpus and provides (descriptive) data analysis of sketches. Optional step in _syntax sketches knockout experiments_. + +* `01d_prepare_sketches.py` -- Creates frequency table of syntax sketches, and prepares datasets for sketches knockout experiments: removes clauses corresponding to sketches systematically from train, dev and test sets. Executes sections in the configuration starting with prefix `make_sketches_table_` and `prepare_knockout_`. This is a preprocessing step required by _syntax sketches knockout experiments_. Example usage: + + * `python 01d_prepare_sketches.py confs/conf_edt_v26_Stanza_ME_sketches_knockout_5_groups.ini` + +* `02_split_data.py` -- Creates data splits (or joins) for model training and evaluation. Executes all sections starting with `split_` and `join_` in input configuration file. Example usage: + + * `python 02_split_data.py confs/conf_edt_v26_Stanza_ME_full.ini` + +* `02b_make_gaps.py` -- Modifies conllu files for gap experiments: deletes a combination of fields `form`, `lemma`, `upos`, `xpos`, `feats` from files, and writes files with deletions to a new location. For implemented modifications, see the header comment of the script. Executes all sections starting with `modify_conllu_` in input configuration file. 
Example usage: + + * `python 02b_make_gaps.py confs/conf_edt_v26_Stanza_ME_gap_experiments.ini` + +* `03_predict_stanza_morph.py` -- Uses stanza's POS/morphological features tagger and lemmatizer models for predicting morphological annotations. Executes all sections starting with `predict_morph_feats_stanza_` in input configuration file. Example: + + * `python 03_predict_stanza_morph.py confs/conf_edt_v26_Stanza_UD_morph_Stanza_UD_syntax.ini` + +* `03_train_stanza.py` -- Trains stanza parser models. Executes all sections starting with `train_stanza_` in input configuration file. Example: + + * `python 03_train_stanza.py confs/conf_edt_v26_Stanza_ME_full.ini` + +* `03b_optimize_malt.py` -- Optimizes MaltParser before training: produces feature selection files. Requires Python 2.7. Executes all sections starting with `maltoptimize_` in input configuration file. Example: + + * `python 03b_optimize_malt.py confs/conf_edt_v26_MaltParser_ME_full.ini` + +* `03c_train_malt_udpipe.py` -- Trains MaltParser and/or UDPipe-1 models. Executes all sections starting with `train_malt_` and `train_udpipe1_` in input configuration file. Example: + + * `python 03c_train_malt_udpipe.py confs/conf_edt_v26_MaltParser_ME_full.ini` + +* `04_predict_stanza.py` -- Applies trained stanza parser models on evaluation data to get predictions. Writes predictions to conllu files. Executes all sections starting with `predict_stanza_` in input configuration file. Example: + + * `python 04_predict_stanza.py confs/conf_edt_v26_Stanza_ME_full.ini` + +* `04b_predict_malt_udpipe.py` -- Applies trained MaltParser and/or UDPipe-1 models on evaluation data to get predictions. Writes predictions to conllu files. Executes all sections starting with `predict_malt_` and `predict_udpipe1_` in input configuration file. 
Example: + + * `python 04b_predict_malt_udpipe.py confs/conf_edt_v26_MaltParser_ME_full.ini` + +* `05_evaluate.py` -- Evaluates predictions: compares predicted files to gold standard files and calculates LAS/UAS scores. Executes all sections starting with `eval_` in given configuration files (multiple INI files can be given as an input). Writes results into file `results.csv` in a sub directory closest to the execution directory (for given configurations, the path will be: `edt_2.6/results.csv`). You can also give name of the output csv file as an input argument of the script. Example usage: + + * `python 05_evaluate.py confs/conf_edt_v26_MaltParser_ME_full.ini results_maltparser.csv` + +Note: configurations also contain overlapping parts, e.g. once you've run UD preprocessing with `confs/conf_edt_v26_Stanza_ME_full.ini`, you do not need to run UD preprocessing again with `confs/conf_edt_v26_stanza_ME_ensemble_full.ini`; + +### Results and further studies + + +* [06_result_tables.ipynb](06_result_tables.ipynb) -- tables with the experiment results read from CSV files; + +* [07_smaller_data_exp_and_extrapolation.ipynb](07_smaller_data_exp_and_extrapolation.ipynb) -- draw figures about smaller data experiments (experiments where training set size is gradually increased) and extrapolate the results; + +* [08_results_clauses_and_sketches_knockout_5groups.ipynb](08_results_clauses_and_sketches_knockout_5groups.ipynb) -- results of the _clauses experiment_ and _syntax sketches knockout experiments_; + +### Extras + +* [X1_previously_reported_performances.ipynb](X1_previously_reported_performances.ipynb) -- a small recap of previously reported dependency parsing performances for Estonian and English; + +### Citation + +In case you use this codebase or results in your work, please cite us as follows: + + @article{laur2024automatic, + title={Automatic dependency parsing of Estonian: what linguistic features to include?}, + author={Laur, Sven and Orasmaa, Siim and Eiche, 
Sandra and Särg, Dage}, + journal={Language Resources and Evaluation}, + year={2024}, + publisher={Springer}, + url={https://doi.org/10.1007/s10579-024-09779-z}, + doi={10.1007/s10579-024-09779-z}, + } \ No newline at end of file diff --git a/syntax_sketches/__init__.py b/syntax_sketches/__init__.py new file mode 100644 index 00000000..5fec1e74 --- /dev/null +++ b/syntax_sketches/__init__.py @@ -0,0 +1,3 @@ +from .syntax_sketch import clean_clause +from .syntax_sketch import syntax_sketch +from .clause_export import export_cleaned_clause diff --git a/syntax_sketches/clause_export.py b/syntax_sketches/clause_export.py new file mode 100644 index 00000000..fac2db62 --- /dev/null +++ b/syntax_sketches/clause_export.py @@ -0,0 +1,83 @@ +from typing import Dict, List + + +def export_cleaned_clause(clause: Dict[str, list]) -> str: + """ + Exports a cleaned clause into the CoNLL-U format. This can be used to create various data sets. + The function assumes that the input is generated by the function clean_clause and that there is only one root node. + Returns a properly formatted table where each token is on the separate line and upos and xpos fields are the same. 
def remove_extracted_from_conllu_and_dicts(overall_conllu: List[str],
                                           overall_dicts: List[Dict[str, list]],
                                           extracted_conllu: List[str]):
    '''
    Removes items of extracted_conllu from overall_conllu (and from corresponding
    overall_dicts).

    Assumes that overall_conllu and extracted_conllu are lists of CoNLL-U format
    clause strings, and that extracted_conllu is a sub list of overall_conllu.
    overall_dicts should be a list of dictionaries with CoNLL features, each dict
    corresponding to a clause with the same index in overall_conllu.

    Note that there can be repeating clauses in overall_conllu, e.g. clause "ma ei tea"
    ('I don't know') may appear multiple times. If a repeating clause appears in
    extracted_conllu, all of its instances will be deleted from overall_conllu and
    overall_dicts.
    This function keeps track of how many times each clause (conllu) was deleted and
    returns a dictionary mapping extracted clause conllu-s to corresponding deletion
    counts (for debugging purposes). Clauses of extracted_conllu that do not occur
    in overall_conllu are not included in the dictionary.

    Returns (new_clause_conllu, new_clause_dicts, deletion_counts)
    '''
    assert len(overall_conllu) == len(overall_dicts)
    # Build the lookup once: set membership is O(1), whereas testing against
    # the list would rescan it for every clause.
    extracted = set(extracted_conllu)
    new_clause_conllu = []
    new_clause_dicts = []
    deletion_counts = dict()
    for conllu, clause_dict in zip(overall_conllu, overall_dicts):
        if conllu in extracted:
            # Delete clause and keep track of how many times it was deleted
            deletion_counts[conllu] = deletion_counts.get(conllu, 0) + 1
        else:
            # Preserve clause
            new_clause_conllu.append(conllu)
            new_clause_dicts.append(clause_dict)
    return new_clause_conllu, new_clause_dicts, deletion_counts
If as_dicts==True, then returns a list of dicts, + each dict containing CoNLL features extracted from a single clause + (dict keys: ids, postags, deprels, heads, root_loc, wordforms, + lemmas, features). + + CoNLL-U strings output format is used in filtering/removing clauses, + dicts output format is a basis for creating syntax sketches for + clauses. + + The function assumes that clauses in the input file have been + generated by the function export_cleaned_clause. + """ + assert os.path.exists( input_file ), f'Non-existent input conllu file: {input_file}' + clause_dicts = [] + clause_strings = [] + with open(input_file, 'r', encoding='utf-8') as in_f: + clause_started = False + current_clause = [] + current_clause_dict = dict(ids=[], postags=[], deprels=[], + heads=[], root_loc=[], wordforms=[], + lemmas=[], features=[]) + for line in in_f: + if len(line.rstrip()) > 0: + if line[0].isnumeric(): + number_of_fields = len(line.split('\t')) + assert number_of_fields == len(conllu2dictkey), \ + f'Unexpected number of fields ({number_of_fields}) on conllu line: {line!r}' + # line corresponds to a word's analysis + if not as_dicts: + # extract only CoNLL line + current_clause.append( line.rstrip() ) + else: + # extract CoNLL field values to dict + parts = line.split('\t') + for field_id, (conllu_field, dict_key) in enumerate(conllu2dictkey): + if dict_key is not None: + value = parts[field_id] + current_clause_dict[dict_key].append(value) + clause_started = True + else: + if not line.startswith('#'): + raise ValueError(f'Unexpected conllu line: {line!r}') + else: + if clause_started: + # end of a clause + if not as_dicts: + # Save clause CoNLL lines + clause_strings.append('\n'.join(current_clause)) + current_clause = [] + else: + # Find indices of root nodes + ids = current_clause_dict['ids'] + heads = current_clause_dict['heads'] + root_locations = [i for i, head in enumerate(heads) if head not in ids] + current_clause_dict['root_loc'] = root_locations + # Save clause 
dict + clause_dicts.append(current_clause_dict) + current_clause_dict = dict(ids=[], postags=[], deprels=[], + heads=[], root_loc=[], wordforms=[], + lemmas=[], features=[]) + clause_started = False + # add the last remaining clause + if clause_started: + if not as_dicts: + # Save clause CoNLL lines + clause_strings.append('\n'.join(current_clause)) + else: + # Find indices of root nodes + ids = current_clause_dict['ids'] + heads = current_clause_dict['heads'] + root_locations = [i for i, head in enumerate(heads) if head not in ids] + current_clause_dict['root_loc'] = root_locations + # Save clause dict + clause_dicts.append(current_clause_dict) + return clause_strings if not as_dicts else clause_dicts + diff --git a/syntax_sketches/deprel_seq.py b/syntax_sketches/deprel_seq.py new file mode 100644 index 00000000..f558a2e3 --- /dev/null +++ b/syntax_sketches/deprel_seq.py @@ -0,0 +1,20 @@ +import conllu + +def collect_deprel_seqs(in_file: str): + ''' + Reads sentences from given conllu file (`in_file`) and + collects all `deprel` sequences corresponding to sentences. + Returns a list of sentences, each sentence represented as + a `deprel` sequence string, where word `deprels` are joined + via `|`. + This function is used only for data exploration. 
def clean_clause(clause: 'Layer') -> Dict[str, list]:
    """
    Removes spurious words from clause and extracts relevant information from other layers.
    Spurious words can occur at the beginning or at the end of the clause:
    * conjunctions (postag contains 'J')
    * punctuation marks (postag contains 'Z')

    Returns a dictionary of aligned vectors for clause members:
    * ids       -- token numbers
    * postags   -- part-of-speech tags
    * deprels   -- dependency relations
    * heads     -- head of the node
    * root_loc  -- indices of root nodes
    * wordforms -- complete text
    * lemmas    -- lemma
    * features  -- other syntactic features

    If nothing remains after the removal, all vectors are empty.

    Syntax information is specified as in the syntax tree corresponding to the entire sentence.
    As clause finding algorithm is not perfect there can be several roots in the clause:
    every token whose head is not among the remaining ids is reported as a root.
    The information about root can be found by fetching the corresponding field, e.g. ids[root_loc[0]].
    These fields contain enough information to store the cleaned clause in the conll-format
    """

    def is_spurious(postag) -> bool:
        return 'J' in postag or 'Z' in postag

    # Extract relevant fields
    syntax = clause.ud_syntax
    ids = list(syntax.id)
    postags = list(syntax.xpostag)
    deprels = list(syntax.deprel)
    heads = list(syntax.head)

    wordforms = list(syntax.text)
    lemmas = list(syntax.lemma)
    features = list(syntax.feats)

    # Locate the span [start, end) that remains after dropping leading and
    # trailing punctuation marks and conjunctions. All aligned vectors are
    # then cut with a single slice instead of repeated element-wise pops.
    start = 0
    end = len(postags)
    while start < end and is_spurious(postags[start]):
        start += 1
    while end > start and is_spurious(postags[end - 1]):
        end -= 1

    if start == end:
        return dict(ids=[], postags=[], deprels=[], heads=[], root_loc=[], wordforms=[], lemmas=[], features=[])

    kept = slice(start, end)
    ids = ids[kept]
    heads = heads[kept]

    # Find indices of root nodes: tokens whose head is outside of the cleaned clause
    root_locations = [i for i, head in enumerate(heads) if head not in ids]

    return dict(
        ids=ids, postags=postags[kept], deprels=deprels[kept], heads=heads,
        root_loc=root_locations,
        wordforms=wordforms[kept], lemmas=lemmas[kept], features=features[kept])
Dict[str, list], ordered=True) -> str: + """ + Computes syntax sketch for a clause that encodes information about the root node and the first level child nodes. + By default the first level child nodes are lexicographically ordered in the sketch. + + Examples: + + wordforms: ['Ma', 'kaldun', 'arvama'] + ids: [1, 2, 3] + heads: [2, 0, 2] + postags: ['P', 'V', 'V'] + deprels: ['nsubj', 'root', 'xcomp'] + root_loc: [1] + output: '[V]nsubj(L)xcomp(L)' + + wordforms: ['Vermeeri', 'saatus', 'oli', 'teistsugune'] + ids: [6, 7, 8, 9] + heads: [7, 9, 9, 3] + postags: ['S', 'S', 'V', 'P'] + deprels: ['nmod', 'nsubj:cop', 'cop', 'ccomp'] + root_loc: [3] + output: '[S]cop(L)nsubj:cop(L)' + + wordforms: ['uus', 'ooper', 'tuleb', 'habras', 'ja', 'ilus'] + ids: [8, 9, 10, 11, 12, 13] + heads: [9, 10, 2, 10, 13, 11] + postags: ['A', 'S', 'V', 'A', 'J', 'A'] + deprels: ['amod', 'nsubj', 'ccomp', 'xcomp', 'cc', 'conj'] + root_loc: [2] + output: '[V]nsubj(L)xcomp(P)' + """ + + assert len(clause['root_loc']) == 1, "The clause must have a single root" + + # Compute root tag for the sketch + root_tag = clause['postags'][clause['root_loc'][0]] + if root_tag == 'V': + # group of verbs + sketch_root = 'V' + elif root_tag in ['S', 'P', 'A', 'Y', 'N']: + # non-verbs: substantives, pronouns, adjectives, + # acronyms/abbreviations, numerals + sketch_root = 'S' + else: + # remaining postags + sketch_root = 'X' + + # Compute sketches for child nodes + first_level = list() + root = clause['ids'][clause['root_loc'][0]] + for i, head in enumerate(clause['heads']): + if head != root: + continue + + length = subtree_size(clause['heads'], clause['ids'], clause['ids'][i]) + if length < 3: + subtree_cat = 'L' + elif length < 10: + subtree_cat = 'P' + else: + subtree_cat = 'ÜP' + + subtree = clause['deprels'][i] + '({})'.format(subtree_cat) + first_level.append(subtree) + + if ordered: + return '[{root}]{children}'.format(root=sketch_root, children=''.join(sorted(first_level))) + else: + return 
'[{root}]{children}'.format(root=sketch_root, children=''.join(first_level)) + + +def safe_sketch_name(sketch_name: str) -> str: + ''' + Makes sketch name safe so that it can be used as (a part of) file name. + Returns safe name. + ''' + safe_name = sketch_name.replace(':', 'COLON').replace(')', '').replace('(', '').replace('[', '').replace(']', '') + assert safe_name.isidentifier() + return safe_name + + +# ===================================================== +# Compute sketches for the whole corpus +# ===================================================== + +def compute_sketches(input_dir:str, skip_files:List[str]=['train_full.conllu'], verbose:bool=True) -> Tuple[List[str], int]: + ''' + Loads clauses from conllu files in the input_dir and computes syntax sketches + for all clauses that have a single root. + Assumes that all conllu files in the input_dir have been created via script + "01b_extract_clauses.py", that is, they contain clauses instead of sentences. + Optionally, you can skip some of the input files via parameter skip_files. 
+ Returns tuple: (list_of_sketches, clauses_count_total) + ''' + # 1) Import data from conllu files, rename sentences -> clauses and validate + expected_layers = {'clauses', 'ud_syntax', 'words'} + whole_data = [] + for fname in os.listdir(input_dir): + if fname in skip_files: + continue + if fname.endswith('.conllu'): + text_obj = conll_to_text( os.path.join(input_dir, fname), + 'ud_syntax', + remove_empty_nodes=True) + text_obj.meta['file'] = fname + # Rename sentences layer + # (because it actually contains clauses, not sentences) + clauses_layer = text_obj.pop_layer('sentences') + clauses_layer.name = 'clauses' + text_obj.add_layer(clauses_layer) + # Validate text layers + assert text_obj.layers == expected_layers, \ + f'Unexpected layers {text_obj.layers!r}' + whole_data.append(text_obj) + # 2) Create sketches from the data + clauses_count_total = 0 + invalid_clauses_total = 0 + sketches = [] + for text_obj in whole_data: + clauses_count = 0 + for clause in text_obj.clauses: + cleaned_clause = clean_clause(clause) + if len(cleaned_clause['root_loc']) != 1: + # At this point, assuming input processed with + # "01b_extract_clauses.py", we actually should + # not encounter any invalid clauses ... 
+ invalid_clauses_total += 1 + continue + sketches.append(syntax_sketch(cleaned_clause)) + clauses_count += 1 + if verbose: + print(text_obj.meta['file'], '|', f'#clauses: {clauses_count}') + clauses_count_total += clauses_count + if verbose: + print() + print(f'#clauses total: {clauses_count_total}') + if invalid_clauses_total > 0: + print(f'#invalid clauses total: {invalid_clauses_total}') + return sketches, clauses_count_total + + +# ===================================================== +# Distribute syntax sketches randomly into bins +# ===================================================== + +def rand_group_sketches(sketches: List[Union[str, List[Any]]], n:int, seed:int=5) \ + -> List[List[Union[str, List[Any]]]]: + ''' + Distributes given sketches randomly into n same size groups. + Returns list of lists of sketches, one sub list for each group. + ''' + result = [] + if not n <= len(sketches): + raise ValueError(f'(!) Number of sketches ({len(sketches)}) '+\ + f'is smaller than number of groups ({n}).') + rnd = Random(seed) + rnd.shuffle(sketches) + for i in range(n): + result.append([]) + for sid, sketch in enumerate(sketches): + result[sid % n].append(sketch) + assert len(sketches) == sum([len(g) for g in result]) + return result + + +# ===================================================== +# Filtering lists of clauses by sketches +# ===================================================== + +def extract_sketches(clause_conllus: List[str], clause_dicts: List[Dict[str, list]], + target_sketch:str, amount:Optional[int]=None, verbose:bool=False): + ''' + Extracts given amount of target_sketch from clause_conllus and clause_dicts. + Note that the extraction operation is virtual: the input lists clause_conllus + and clause_dicts are not affected. + Returns extracted items. + If amount is None (default), then extracts all clauses corresponding to the sketch. 
+ Returns triple: (extracted_conllus, extracted_dicts, number_of_extracted_items) + ''' + assert len(clause_conllus) == len(clause_dicts), \ + 'Unexpectedly, numers of conllu clauses and corresponding clause dicts differ: '+\ + f' {len(clause_conllus)} vs {len(clause_dicts)}' + extracted = [] + extracted_dicts = [] + for clause_id, clause_conllu in enumerate(clause_conllus): + clause_dict = clause_dicts[clause_id] + sketch = syntax_sketch(clause_dict) + if sketch == target_sketch: + if amount is None or len(extracted) < amount: + extracted.append( clause_conllu ) + extracted_dicts.append( clause_dict ) + if verbose: + print('Extracted {} instances of sketch {}'. format(len(extracted), target_sketch)) + return extracted, extracted_dicts, len(extracted) + + +def remove_sketches(clause_conllus: List[str], clause_dicts: List[Dict[str, list]], + target_sketch:str, amount:Optional[int]=None, verbose:bool=False): + ''' + Removes given amount of target_sketch from clause_conllus and clause_dicts. + Note that the removal operation is virtual: the input lists clause_conllus and + clause_dicts are not affected. + Returns preserved items after removal (and count of removed items). + If amount is None (default), then removes all clauses corresponding to the sketch. + Returns triple: (preserved_conllus, preserved_dicts, number_of_removed_items) + ''' + assert len(clause_conllus) == len(clause_dicts), \ + 'Unexpectedly, numers of conllu clauses and corresponding clause dicts differ: '+\ + f' {len(clause_conllus)} vs {len(clause_dicts)}' + preserved = [] + preserved_dicts = [] + removed = 0 + for clause_id, clause_conllu in enumerate(clause_conllus): + clause_dict = clause_dicts[clause_id] + sketch = syntax_sketch(clause_dict) + if sketch == target_sketch: + if amount is None or removed < amount: + removed += 1 + continue + preserved.append( clause_conllu ) + preserved_dicts.append( clause_dict ) + if verbose: + print('Removed {} instances of sketch {}'. 
format(removed, target_sketch)) + return preserved, preserved_dicts, removed + + +def remove_sketches_group(clause_conllus: List[str], clause_dicts: List[Dict[str, list]], + target_sketches:List[str], verbose:bool=False): + ''' + Removes all target_sketches from clause_conllus and clause_dicts. + Note that the removal operation is virtual: the input lists clause_conllus and + clause_dicts are not affected. + Returns preserved items after the removal (and total count of removed items). + Returns triple: (preserved_conllus, preserved_dicts, number_of_removed_items) + ''' + assert len(clause_conllus) == len(clause_dicts), \ + 'Unexpectedly, numers of conllu clauses and corresponding clause dicts differ: '+\ + f' {len(clause_conllus)} vs {len(clause_dicts)}' + assert len(target_sketches) > 0, 'Unexpectedly, got an empty target_sketches list' + preserved = [] + preserved_dicts = [] + removed = 0 + target_sketches_set = set(target_sketches) + for clause_id, clause_conllu in enumerate(clause_conllus): + clause_dict = clause_dicts[clause_id] + sketch = syntax_sketch(clause_dict) + if sketch in target_sketches_set: + removed += 1 + continue + preserved.append( clause_conllu ) + preserved_dicts.append( clause_dict ) + if verbose: + print('Removed {} instances of sketches {}'. format(removed, target_sketches)) + return preserved, preserved_dicts, removed \ No newline at end of file