From 8b532ec4021864aba49858458a8f2ea92bbbbd17 Mon Sep 17 00:00:00 2001 From: Shannon Shen <22512825+lolipopshock@users.noreply.github.com> Date: Mon, 27 Jun 2022 16:38:53 -0700 Subject: [PATCH 1/2] rename sepration to separation --- scripts/push_model_to_hub.sh | 4 ++-- scripts/train_ivila.sh | 2 +- src/vila/dataset/preprocessors/config.py | 7 +++++-- .../dataset/preprocessors/layout_indicator.py | 14 +++++++------- tests/test_preprocessor.py | 15 ++++++--------- tools/arguments.py | 12 ++++++------ tools/push_model_to_hf_hub.py | 6 +++--- tools/train-ivila.py | 6 +++--- 8 files changed, 33 insertions(+), 33 deletions(-) diff --git a/scripts/push_model_to_hub.sh b/scripts/push_model_to_hub.sh index 22515e4..b8c0375 100644 --- a/scripts/push_model_to_hub.sh +++ b/scripts/push_model_to_hub.sh @@ -9,7 +9,7 @@ python push_model_to_hf_hub.py \ --repo-name ivila-block-layoutlm-finetuned-docbank \ --agg_level "block" \ --group_bbox_agg "first" \ - --added_special_sepration_token "[BLK]" + --added_special_separation_token "[BLK]" # DocBank HVILA Block Finetuned python push_model_to_hf_hub.py \ @@ -34,7 +34,7 @@ python push_model_to_hf_hub.py \ --repo-name ivila-block-layoutlm-finetuned-grotoap2 \ --agg_level "block" \ --group_bbox_agg "first" \ - --added_special_sepration_token "[BLK]" + --added_special_separation_token "[BLK]" # GROTOAP2 HVILA Block Finetuned python push_model_to_hf_hub.py \ diff --git a/scripts/train_ivila.sh b/scripts/train_ivila.sh index 256fda5..de13b07 100644 --- a/scripts/train_ivila.sh +++ b/scripts/train_ivila.sh @@ -71,6 +71,6 @@ python train-ivila.py \ --per_device_eval_batch_size 40 \ --warmup_steps 2000 \ --load_best_model_at_end \ - --added_special_sepration_token $used_token \ + --added_special_separation_token $used_token \ --agg_level $agg_level \ --fp16 \ No newline at end of file diff --git a/src/vila/dataset/preprocessors/config.py b/src/vila/dataset/preprocessors/config.py index 1628d7a..99329d7 100644 --- a/src/vila/dataset/preprocessors/config.py +++ b/src/vila/dataset/preprocessors/config.py @@ -13,7 +13,7 @@ class VILAPreprocessorConfig: agg_level: str = "row" #"block", "sentence" label_all_tokens: bool = False group_bbox_agg: str = "first" - added_special_sepration_token: str = "[BLK]" + added_special_separation_token: str = "[BLK]" def to_json(self, path: str): with open(path, "w") as fp: @@ -25,7 +25,10 @@ def from_pretrained(cls, model_path: str, **kwargs): config = AutoConfig.from_pretrained(model_path) if hasattr(config, "vila_preprocessor_config"): - data_json = config.vila_preprocessor_config + data_json = config.vila_preprocessor_config.copy() + if "added_special_sepration_token" in data_json: + data_json["added_special_separation_token"] = data_json.pop("added_special_sepration_token") + # Fix an old typo in the config data_json.update(kwargs) return cls(**data_json) # We store the vila-preprocessor configs inside diff --git a/src/vila/dataset/preprocessors/layout_indicator.py b/src/vila/dataset/preprocessors/layout_indicator.py index 4d6049d..9d1b44e 100644 --- a/src/vila/dataset/preprocessors/layout_indicator.py +++ b/src/vila/dataset/preprocessors/layout_indicator.py @@ -54,9 +54,9 @@ def __init__( super().__init__(tokenizer, config, text_column_name, label_column_name) - self.added_special_sepration_token = config.added_special_sepration_token - if self.added_special_sepration_token == "default": - self.added_special_sepration_token = tokenizer.special_tokens_map[ + self.added_special_separation_token = config.added_special_separation_token + if self.added_special_separation_token == "default": + self.added_special_separation_token = tokenizer.special_tokens_map[ "sep_token" ] @@ -127,7 +127,7 @@ def preprocess_sample(self, example: Dict, padding="max_length") -> Dict: self.special_tokens_map[ self.tokenizer.special_tokens_map["sep_token"] ], - self.special_tokens_map[self.added_special_sepration_token], + self.special_tokens_map[self.added_special_separation_token], ]: # Because we could possibly insert [SEP] or [BLK] tokens in # this process. @@ -180,7 +180,7 @@ def insert_layout_indicator(self, example: Dict) -> Tuple[Dict, Dict]: ) processed_words.extend( words[pre_index : pre_index + cur_len] - + [self.added_special_sepration_token] + + [self.added_special_separation_token] ) processed_bbox.extend( bbox[pre_index : pre_index + cur_len] @@ -226,7 +226,7 @@ def insert_layout_indicator(self, example: Dict) -> Tuple[Dict, Dict]: ) processed_words.extend( words[pre_index : pre_index + cur_len] - + [self.added_special_sepration_token] + + [self.added_special_separation_token] ) processed_bbox.extend( bbox[pre_index : pre_index + cur_len] @@ -271,7 +271,7 @@ def insert_layout_indicator(self, example: Dict) -> Tuple[Dict, Dict]: range(new_sequence_len, new_sequence_len + end - start) ) processed_words.extend( - words[start:end] + [self.added_special_sepration_token] + words[start:end] + [self.added_special_separation_token] ) processed_bbox.extend(bbox[start:end] + [union_box(bbox[start:end])]) processed_labels.extend(labels[start:end] + [-100]) diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index d96879f..758969f 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -5,6 +5,7 @@ from vila.constants import * from vila.dataset.preprocessors.base import SimplePDFDataPreprocessor +from vila.dataset.preprocessors.config import VILAPreprocessorConfig from vila.dataset.preprocessors.layout_indicator import ( BlockLayoutIndicatorPDFDataPreprocessor, RowLayoutIndicatorPDFDataPreprocessor, @@ -48,15 +49,11 @@ use_auth_token=None, ) - -class Config: - pass - - -config = Config() -config.label_all_tokens = False -config.added_special_sepration_token = "[SEP]" -config.group_bbox_agg = "union" +config = VILAPreprocessorConfig( + label_all_tokens = False, + added_special_separation_token = "[SEP]", + group_bbox_agg = "union" +) def test_sentence_indicator_processor(): diff --git a/tools/arguments.py b/tools/arguments.py index 6c4a996..db35653 100644 --- a/tools/arguments.py +++ b/tools/arguments.py @@ -53,7 +53,7 @@ class ModelArguments: ######### VILA Settings ######### ################################# - added_special_sepration_token: str = field( + added_special_separation_token: str = field( default="SEP", metadata={ "help": "The added special token for I-VILA models for separating the blocks/sentences/rows. Can be one of {SEP, BLK}. Default to `SEP`." @@ -72,13 +72,13 @@ class ModelArguments: def __post_init__(self): - assert self.added_special_sepration_token in ["BLK", "SEP"] + assert self.added_special_separation_token in ["BLK", "SEP"] - if self.added_special_sepration_token == "BLK": - self.added_special_sepration_token = "[BLK]" + if self.added_special_separation_token == "BLK": + self.added_special_separation_token = "[BLK]" - if self.added_special_sepration_token == "SEP": - self.added_special_sepration_token = "[SEP]" + if self.added_special_separation_token == "SEP": + self.added_special_separation_token = "[SEP]" @dataclass diff --git a/tools/push_model_to_hf_hub.py b/tools/push_model_to_hf_hub.py index 2ccb85d..2f41cae 100644 --- a/tools/push_model_to_hf_hub.py +++ b/tools/push_model_to_hf_hub.py @@ -29,7 +29,7 @@ def write_json(data, filename): parser.add_argument("--agg_level", type=str, default=None, help="desc") parser.add_argument("--label_all_tokens", type=str, default=None, help="desc") parser.add_argument("--group_bbox_agg", type=str, default=None, help="desc") - parser.add_argument("--added_special_sepration_token", type=str, default=None, help="desc") + parser.add_argument("--added_special_separation_token", type=str, default=None, help="desc") args = parser.parse_args() print(f"Loading Models from {args.model_path}") @@ -58,8 +58,8 @@ def write_json(data, filename): vila_preprocessor_config['label_all_tokens'] = args.label_all_tokens if args.group_bbox_agg is not None: vila_preprocessor_config['group_bbox_agg'] = args.group_bbox_agg - if args.added_special_sepration_token is not None: - vila_preprocessor_config['added_special_sepration_token'] = args.added_special_sepration_token + if args.added_special_separation_token is not None: + vila_preprocessor_config['added_special_separation_token'] = args.added_special_separation_token model_config.vila_preprocessor_config = vila_preprocessor_config diff --git a/tools/train-ivila.py b/tools/train-ivila.py index 5d49ea2..7a8c626 100644 --- a/tools/train-ivila.py +++ b/tools/train-ivila.py @@ -216,8 +216,8 @@ def get_label_list(labels): use_auth_token=True if model_args.use_auth_token else None, ) - if model_args.added_special_sepration_token not in tokenizer.special_tokens_map.values(): - tokenizer.add_special_tokens({"additional_special_tokens": [model_args.added_special_sepration_token]}) + if model_args.added_special_separation_token not in tokenizer.special_tokens_map.values(): + tokenizer.add_special_tokens({"additional_special_tokens": [model_args.added_special_separation_token]}) model.resize_token_embeddings(len(tokenizer)) # In a previous version, we try to avoid resizing the token embeddings by directly # modifying the unused tokens in vocab. However, this is not possible as not all tokenizer @@ -233,7 +233,7 @@ def get_label_list(labels): ) logger.info(f"The used agg level is {data_args.agg_level}") - data_args.added_special_sepration_token = model_args.added_special_sepration_token + data_args.added_special_separation_token = model_args.added_special_separation_token preprocessor = instantiate_dataset_preprocessor( "layout_indicator", tokenizer, data_args ) From 18952f3175a23a0625145ef78ea2e9e8122d8135 Mon Sep 17 00:00:00 2001 From: Shannon Shen <22512825+lolipopshock@users.noreply.github.com> Date: Mon, 27 Jun 2022 16:39:00 -0700 Subject: [PATCH 2/2] Add vila run test --- tests/test_vila_run.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/test_vila_run.py diff --git a/tests/test_vila_run.py b/tests/test_vila_run.py new file mode 100644 index 0000000..00146ed --- /dev/null +++ b/tests/test_vila_run.py @@ -0,0 +1,31 @@ +import layoutparser as lp # For visualization + +from vila.pdftools.pdf_extractor import PDFExtractor +from vila.predictors import HierarchicalPDFPredictor, LayoutIndicatorPDFPredictor + +def test_hvila_run(): + + pdf_extractor = PDFExtractor("pdfplumber") + page_tokens, page_images = pdf_extractor.load_tokens_and_image(f"tests/fixtures/large.pdf") + + vision_model = lp.EfficientDetLayoutModel("lp://PubLayNet") + pdf_predictor = HierarchicalPDFPredictor.from_pretrained("allenai/hvila-row-layoutlm-finetuned-docbank") + + for idx, page_token in enumerate(page_tokens): + blocks = vision_model.detect(page_images[idx]) + page_token.annotate(blocks=blocks) + pdf_data = page_token.to_pagedata().to_dict() + predicted_tokens = pdf_predictor.predict(pdf_data, page_token.page_size) + +def test_ivila_run(): + pdf_extractor = PDFExtractor("pdfplumber") + page_tokens, page_images = pdf_extractor.load_tokens_and_image(f"tests/fixtures/large.pdf") + + vision_model = lp.EfficientDetLayoutModel("lp://PubLayNet") + pdf_predictor = LayoutIndicatorPDFPredictor.from_pretrained("allenai/ivila-block-layoutlm-finetuned-docbank") + + for idx, page_token in enumerate(page_tokens): + blocks = vision_model.detect(page_images[idx]) + page_token.annotate(blocks=blocks) + pdf_data = page_token.to_pagedata().to_dict() + predicted_tokens = pdf_predictor.predict(pdf_data, page_token.page_size)