Merged
1 change: 0 additions & 1 deletion README.md
@@ -97,7 +97,6 @@ pip install -r requirements.txt
```

For conda:
-
```bash
conda env create -f environment.yml
```
4 changes: 2 additions & 2 deletions notebooks/2.0-entity-extraction-baselines.ipynb
@@ -38,7 +38,7 @@
"# ensure that the parent directory is on the path for relative imports\n",
"sys.path.append(os.path.join(os.path.abspath(''), \"..\"))\n",
"\n",
-"from src.entity_extraction.baseline_entity_extraction import (\n",
+"from src.entity_extraction.prediction.baseline_entity_extraction import (\n",
" extract_geographic_coordinates,\n",
" extract_region_names,\n",
" extract_taxa,\n",
@@ -48,7 +48,7 @@
" baseline_extract_all\n",
")\n",
"\n",
-"from src.entity_extraction.entity_extraction_evaluation import (\n",
+"from src.entity_extraction.evaluation.entity_extraction_evaluation import (\n",
" get_token_labels,\n",
" plot_token_classification_report,\n",
" calculate_entity_classification_metrics,\n",
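The notebook hunks above rely on the repo root being importable before `from src.entity_extraction.prediction... import ...` can resolve. A minimal sketch of that setup, factored into a helper (the function name and paths are illustrative, not from the diff):

```python
import os
import sys

def add_parent_to_path(notebook_dir: str) -> str:
    """Append a notebook directory's parent to sys.path so that
    absolute imports from the `src` package resolve."""
    parent = os.path.normpath(os.path.join(notebook_dir, os.pardir))
    if parent not in sys.path:
        sys.path.append(parent)
    return parent
```

This mirrors the notebooks' `sys.path.append(os.path.join(os.path.abspath(''), ".."))`, since the notebooks sit one level below the repo root.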
4 changes: 2 additions & 2 deletions notebooks/2.1-entity-extraction-spacy.ipynb
@@ -48,9 +48,9 @@
"# ensure that the parent directory is on the path for relative imports\n",
"sys.path.append(os.path.join(os.path.abspath(''), \"..\"))\n",
"\n",
-"from src.entity_extraction.spacy_entity_extraction import spacy_extract_all\n",
+"from src.entity_extraction.prediction.spacy_entity_extraction import spacy_extract_all\n",
"\n",
-"from src.entity_extraction.entity_extraction_evaluation import (\n",
+"from src.entity_extraction.evaluation.entity_extraction_evaluation import (\n",
" get_token_labels,\n",
" plot_token_classification_report,\n",
" calculate_entity_classification_metrics,\n",
41 changes: 21 additions & 20 deletions src/entity_extraction/LabelStudio_README.md
@@ -6,14 +6,15 @@ The finding fossils team set up a privately hosted version of LabelStudio using H

**Table of Contents**

-- [Label Studio Setup](#label-studio-setup--usage)
-- [Create Azure Blob Storage](#create-azure-blob-storage)
-- [Setup Postgres Database](#setup-postgres-database)
-- [Setup Label Studio External Storage](#setup-label-studio-external-storage)
-- [Label Studio Usage](#labeling-instructions)
-- [Account creation](#create-account)
-- [Navigation](#navigation)
-- [Labeling](#labeling)
+- [Label Studio Setup \& Usage](#label-studio-setup--usage)
+- [**Label Studio Setup**](#label-studio-setup)
+- [**Create Azure Blob Storage**](#create-azure-blob-storage)
+- [**Setup PostgreSQL Database**](#setup-postgresql-database)
+- [**Setup Label Studio External Storage**](#setup-label-studio-external-storage)
+- [**Label Studio Usage**](#label-studio-usage)
+- [**Account creation**](#account-creation)
+- [**Navigation**](#navigation)
+- [**Labeling**](#labeling)
---
## **Label Studio Setup**
---
@@ -77,27 +78,27 @@ Inside the Label Studio instance
2. Send your profile name to *Ty Andrews* to be added to the **Finding Fossils organization** on Hugging Face
Email: [email protected], or create a new organization for a different project to work collaboratively with teammates.
3. Once in the organization, navigate to the organization page from your profile.
-![Organization navigation](../../assets/org_nav.png)
+![Organization navigation](../../assets/labelstudio-instructions/org_nav.png)

-4. In the organization page, click the space
+1. In the organization page, click the space
**LabelStudio**.
-![LabelStudio tab](../../assets/labelstudio_tab.png)
+![LabelStudio tab](../../assets/labelstudio-instructions/labelstudio_tab.png)

-5. Create a LabelStudio account and record your password in your password manager.
-![Create Account](../../assets/account_creation.png)
+1. Create a LabelStudio account and record your password in your password manager.
+![Create Account](../../assets/labelstudio-instructions/account_creation.png)

### **Navigation**

1. Open the **Green** project named like **Finding Fossils Labelling - Production** or create a new one.
-![Project tab](../../assets/green_tab.png)
+![Project tab](../../assets/labelstudio-instructions/green_tab.png)

2. Navigate to the settings menu of the project. Here, several options are available to tweak the settings to be compatible for your task,
-![Project Settings](../../assets/settings.png)
+![Project Settings](../../assets/labelstudio-instructions/settings.png)

- Review or create labelling instructions.
-![Labeling instructions button](../../assets/labeling_instructions_button.png)
+![Labeling instructions button](../../assets/labelstudio-instructions/labeling_instructions_button.png)
- The instructions look like this:
-![Labeling instructions](../../assets/labeling_instructions.png)
+![Labeling instructions](../../assets/labelstudio-instructions/labeling_instructions.png)

- Labeling configuration:
After syncing the buckets, the final step is to define the different categories of entities that the named entity recognition model will be trained to predict. A configuration file is used to define the classes and to initialize the UI components to aid a user label entities. A sample config file has the following tags:
@@ -123,11 +124,11 @@ For general information, [visit LabelStudio's templates page.](https://labelstud.
### **Labeling**

1. Select the task with **global_index** of 0, the **global index** indicates this is the start of the article and start labelling each task by moving onto the next **global_index** number.
-![Global Index of tasks](../../assets/global_index.png)
+![Global Index of tasks](../../assets/labelstudio-instructions/global_index.png)
- **Ensure pre-labelled entities are correct and/or fix:** we have tried to auto-tag entities to make this faster but it’s not perfect and this is what we’re improving, so this commonly misses entities or gets them partially right.
- **Label any missed entities:** these can be things with typos, words being smushed together, etc.
2. **Using the labelling interface:**
-![Labeling](../../assets/labeling.png)
+![Labeling](../../assets/labelstudio-instructions/labeling.png)
3. **Correct a pre-labelled entity:**
- If it’s completely wrong, delete it
- Select the label
@@ -136,4 +137,4 @@ For general information, [visit LabelStudio's templates page.](https://labelstud.
- Delete the entity using above
- Select the correct label and click/drag the correct span of text

-![Correct Entity](../../assets/correct_labels.png)
+![Correct Entity](../../assets/labelstudio-instructions/correct_labels.png)
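The README text above mentions a labeling configuration file that "defines the classes" and "initializes the UI components", but the diff view elides the sample tags themselves. For illustration only, a standard Label Studio named-entity labeling config takes this shape; the entity names below are hypothetical placeholders, not the project's actual tag set:

```xml
<View>
  <!-- Labels declares the entity classes the annotator can apply -->
  <Labels name="label" toName="text">
    <Label value="TAXA" background="green"/>
    <Label value="REGION" background="blue"/>
  </Labels>
  <!-- Text binds the task's $text field as the labelable document -->
  <Text name="text" value="$text"/>
</View>
```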
@@ -19,7 +19,7 @@
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

-from src.entity_extraction.ner_eval import Evaluator
+from src.entity_extraction.evaluation.ner_eval import Evaluator
from src.logs import get_logger

logger = get_logger(__name__)
@@ -28,7 +28,7 @@
os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, os.pardir)
)

-from src.entity_extraction.entity_extraction_evaluation import (
+from src.entity_extraction.evaluation.entity_extraction_evaluation import (
calculate_entity_classification_metrics,
plot_token_classification_report,
generate_classification_results,
@@ -14,7 +14,7 @@ export OUTPUT_DIR="$(pwd)/results/ner/test-results/"
export MODEL_NAME="roberta-finetuned"

# process the labelled files to prepare them for training
-python src/entity_extraction/training/hf_token_classification/hf_evaluate.py \
+python src/entity_extraction/training/huggingface/hf_evaluate.py \
--data_path "$DATA_DIR" \
--model_path "$MODEL_PATH" \
--output_path "$OUTPUT_DIR" \
@@ -15,7 +15,7 @@ export MODEL_NAME="spacy-transformer-v3"
export GPU=False

# process the labelled files to prepare them for training
-python src/entity_extraction/training/spacy_ner/spacy_evaluate.py \
+python src/entity_extraction/evaluation/spacy_evaluate.py \
--data_path "$DATA_DIR" \
--model_path "$MODEL_PATH" \
--output_path "$OUTPUT_DIR" \
@@ -26,7 +26,7 @@
os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, os.pardir)
)

-from src.entity_extraction.entity_extraction_evaluation import (
+from src.entity_extraction.evaluation.entity_extraction_evaluation import (
calculate_entity_classification_metrics,
plot_token_classification_report,
generate_confusion_matrix,
@@ -38,9 +38,6 @@
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

-opt = docopt(__doc__)
-
-
def get_spacy_token_labels(labelled_entities, raw_text):
"""
Returns a list of labels per token in the raw text from spacy generated labels.
@@ -93,7 +90,7 @@ def get_spacy_token_labels(labelled_entities, raw_text):
return split_text, token_labels


-def load_ner_model_pipeline(model_path: str, gpu: bool = False):
+def load_ner_model_pipeline(model_path: str, gpu: str = "False"):
"""
Loads a spacy named entity recognition model.

@@ -206,6 +203,7 @@ def get_labels(ner_model, data):


def main():
+    opt = docopt(__doc__)
# load the model
model = load_ner_model_pipeline(opt["--model_path"], opt["--gpu"])
all_predicted_labels = []
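The last two hunks move `opt = docopt(__doc__)` from module scope into `main()`, so importing the script for its helper functions no longer triggers CLI parsing at import time; relatedly, `gpu` becomes a string default, since docopt delivers option values as strings. A minimal sketch of the pattern, using a hypothetical stand-in parser rather than the real docopt API:

```python
import sys

def parse_args(argv):
    # Hypothetical stand-in for docopt(__doc__): turn "--key=value"
    # tokens into a plain dict of string values.
    return dict(tok.lstrip("-").split("=", 1) for tok in argv if tok.startswith("--"))

def main(argv=None):
    # Parsing inside main() means importing this module for its helpers
    # never reads sys.argv; all option values arrive as strings
    # (hence signatures like gpu: str = "False").
    opt = parse_args(sys.argv[1:] if argv is None else argv)
    return opt
```

The same trade-off applies to any CLI framework: parse in the entry point, not at module import.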
@@ -12,7 +12,7 @@


# ensure that the parent directory is on the path for relative imports
-sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
+sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))


def load_taxa_data(
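The hunk above adds a third `os.pardir`, reflecting the module's deeper location after the package reorganization: the number of `os.pardir` entries must equal the module's depth below the repo root for `from src...` imports to resolve. A small illustration with hypothetical paths:

```python
import os

def repo_root(module_file: str, levels: int) -> str:
    # Walk `levels` directories up from a module's location, mirroring
    # sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, ...)).
    path = os.path.dirname(os.path.abspath(module_file))
    for _ in range(levels):
        path = os.path.join(path, os.pardir)
    return os.path.normpath(path)
```

A module at `src/entity_extraction/preprocessing/` needs three `os.pardir` hops; one at `src/entity_extraction/` needs two.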
@@ -6,7 +6,7 @@
import spacy

# ensure that the parent directory is on the path for relative imports
-sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
+sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir))

from src.logs import get_logger
# logger = logging.getLogger(__name__)
@@ -94,7 +94,7 @@ This script takes a labelled dataset in JSONLines format as input and splits it in
The resulting train, validation, and test sets can be used for training and evaluating machine learning models.

#### **Options**
-- `--raw_label_path=<raw_label_path>`: Specify the path to the directory where the raw label files are located.
+- `--raw_label_path=<raw_label_path>`: Specify the path to the directory where the raw label files exported from LabelStudio and the parquet files containing the reviewed entities are located.

- `--output_path=<output_path>`: Specify the path to the directory where the output files will be written.

@@ -126,4 +126,4 @@ This script manages the creation of custom data artifacts required for training
4. Creates the custom data artifacts that can be used for training or fine-tuning spaCy models.

#### **Options**
-- `--data_path=<data_path>`: Specify the path to the folder containing files in JSONLines format.
+- `--data_path=<data_path>`: Specify the path to the folder containing JSON files in txt/json format.
@@ -25,7 +25,7 @@

logger = get_logger(__name__)

-from src.entity_extraction.entity_extraction_evaluation import get_token_labels
+from src.entity_extraction.evaluation.entity_extraction_evaluation import get_token_labels


def convert_labelled_data_to_hf_format(