7 changes: 2 additions & 5 deletions docker-compose.yml
@@ -10,13 +10,10 @@ services:
volumes:
- ./data/data-review-tool:/MetaExtractor/data/data-review-tool
entity-extraction-pipeline:
image: metaextractor-entity-extraction-pipeline:v0.0.2
image: metaextractor-entity-extraction-pipeline:v0.0.3
build:
dockerfile: ./docker/entity-extraction-pipeline/Dockerfile
context: .
args:
HF_NER_MODEL_NAME: "roberta-finetuned-v3"
SPACY_NER_MODEL_NAME: "spacy-transformer-v3"
ports:
- "5000:5000"
volumes:
@@ -26,4 +23,4 @@ services:
- USE_NER_MODEL_TYPE=huggingface
- LOG_OUTPUT_DIR=/outputs/
- MAX_SENTENCES=20
- MAX_ARTICLES=1
- MAX_ARTICLES=1
39 changes: 23 additions & 16 deletions docker/data-review-tool/README.md
@@ -2,15 +2,22 @@

This docker image contains `Finding Fossils`, a data review tool built with Dash in Python. It is used to visualize the outputs of the models and verify the extracted entities for inclusion in the Neotoma Database.

## Docker Compose Setup
The expected inputs are mounted onto the newly created container as volumes and can be dumped in any folder. An environment variable is set up to provide the path to this folder. It assumes the following:
1. A parquet file containing the outputs from the article relevance prediction component.
2. A zipped file containing the outputs from the named entity extraction component.
3. Once the articles have been verified, the same parquet file referenced by the environment variable `ARTICLE_RELEVANCE_BATCH` is updated with the entities verified by the steward and the review status of the article.

We first build the docker image to install the required dependencies, then run it using `docker-compose` as follows:
```bash
docker-compose build
docker-compose up data-review-tool
```
## Additional Options Enabled by Environment Variables

The following environment variables can be set to change the behavior of the pipeline:
- `ARTICLE_RELEVANCE_BATCH`: This variable gives the name of the article relevance output parquet file.
- `ENTITY_EXTRACTION_BATCH`: This variable gives the name of the entity extraction compressed output file.
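
As a rough sketch of how these variables come together, the snippet below resolves them against the container's input folder (`/MetaExtractor/inputs`, as mounted in the compose example further down); the fallback filenames are the sample values from that example, and this is an illustration rather than the tool's actual startup code.

```python
import os

# Input folder as mounted inside the container in the sample compose file below.
INPUT_DIR = "/MetaExtractor/inputs"

# File names come from the environment; the fallbacks here are the sample
# values from the compose example and are only illustrative.
article_relevance_path = os.path.join(
    INPUT_DIR, os.getenv("ARTICLE_RELEVANCE_BATCH", "sample_parquet_output.parquet")
)
entity_extraction_path = os.path.join(
    INPUT_DIR, os.getenv("ENTITY_EXTRACTION_BATCH", "sample_ner_output.zip")
)

print(article_relevance_path)
print(entity_extraction_path)
```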

## Sample Docker Compose Setup

Update the environment variables and the volume paths defined under the `data-review-tool` service in the `docker-compose.yml` file in the root directory. The volume paths are:

This is the basic docker compose configuration for running the image.
- `INPUT_PATH`: The path to the directory where the data is dumped, e.g. `./data/data-review-tool` (recommended)

```yaml
version: "3.9"
@@ -21,13 +28,13 @@ services:
ports:
- "8050:8050"
volumes:
- ./data/data-review-tool:/MetaExtractor/data/data-review-tool
- {INPUT_PATH}:/MetaExtractor/inputs
environment:
- ARTICLE_RELEVANCE_BATCH=sample_parquet_output.parquet
- ENTITY_EXTRACTION_BATCH=sample_ner_output.zip
```
Then build the docker image to install the required dependencies and run it using `docker-compose` as follows:
```bash
docker-compose build
docker-compose up data-review-tool
```

### Input
The expected inputs are mounted onto the newly created container as volumes and can be dumped in the `data/data-review-tool` folder. The artifacts required by the data review tool to verify a batch of processed articles are:
- A parquet file containing the outputs from the article relevance prediction component.
- A zipped file containing the outputs from the named entity extraction component.

### Output
Once the articles have been verified and the container has been destroyed, the same parquet file referenced in the `Input` section is updated with both the extracted entities (predicted by the model) and the verified entities (corrected by the data steward).
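
For a quick sanity check after a review session, the reviewed parquet can be inspected with pandas; this is a minimal sketch, and the `status` column name is a hypothetical placeholder since the file's schema is not documented here.

```python
import pandas as pd

# Path assumes the recommended ./data/data-review-tool mount and the sample
# batch file name from the compose example above; adjust for your setup.
df = pd.read_parquet("data/data-review-tool/sample_parquet_output.parquet")

print(df.shape)
print(df.columns.tolist())

# If a review-status column exists (column name assumed here), summarise progress.
if "status" in df.columns:
    print(df["status"].value_counts())
```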
20 changes: 9 additions & 11 deletions docker/entity-extraction-pipeline/Dockerfile
@@ -10,25 +10,23 @@ COPY docker/entity-extraction-pipeline/requirements.txt .
# Install the required Python packages
RUN pip install --no-cache-dir -r requirements.txt
RUN python -m nltk.downloader stopwords
RUN pip install https://huggingface.co/finding-fossils/metaextractor-spacy/resolve/main/en_metaextractor_spacy-any-py3-none-any.whl
# install git-lfs to be able to clone model weights from huggingface
RUN apt-get update && apt-get install -y git-lfs
# download the HF model into /app/models/ner/metaextractor
RUN mkdir -p ./models/ner/ \
&& cd ./models/ner/ \
&& git lfs install \
&& git clone https://huggingface.co/finding-fossils/metaextractor

# Copy the entire repository folder into the container
COPY src ./src

# Build args
ARG HF_NER_MODEL_NAME
ARG SPACY_NER_MODEL_NAME

# Set env variables for when running the container
ENV HF_NER_MODEL_NAME=${HF_NER_MODEL_NAME}
ENV SPACY_NER_MODEL_NAME=${SPACY_NER_MODEL_NAME}
# Set default env variables for when running the container
ENV USE_NER_MODEL_TYPE=huggingface
ENV MAX_ARTICLES=-1
ENV MAX_SENTENCES=-1

# Copy in the model defined by the env variable NER_MODEL_NAME from models folder
COPY models/ner/${HF_NER_MODEL_NAME} ./models/ner/${HF_NER_MODEL_NAME}
COPY models/ner/${SPACY_NER_MODEL_NAME} ./models/ner/${SPACY_NER_MODEL_NAME}

# non-root user control inspired from here: https://stackoverflow.com/questions/66349101/docker-non-root-user-does-not-have-writing-permissions-when-using-volumes
# Create a non-root user that owns the input/outputs directory by default
RUN useradd -r extraction-user # no specific user ID
62 changes: 51 additions & 11 deletions docker/entity-extraction-pipeline/README.md
@@ -6,41 +6,81 @@ This docker image contains the models and code required to run entity extraction
2. The raw input data is mounted as a volume to the docker folder `/app/inputs/`
3. The expected output location is mounted as a volume to the docker folder `/app/outputs/`
4. A single JSON file per article is exported into the output folder along with a `.log` file for the processing run.
5. An environment variable `LOG_OUTPUT_DIR` is set to the path of the output folder. This is used to write the log file. Default is the directory from which the docker container is run.
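
As a quick way to confirm a run produced the files described above, the following sketch lists the per-article JSON files and the run log in the output folder; the host path is the recommended one from the compose setup below, and no particular JSON schema is assumed.

```python
import json
import pathlib

# Recommended host output folder from the compose setup below; adjust if you
# mounted a different directory to /outputs/.
output_dir = pathlib.Path("data/entity-extraction/processed/processed_articles")

json_files = sorted(output_dir.glob("*.json"))
log_files = sorted(output_dir.glob("*.log"))
print(f"{len(json_files)} article JSON file(s), {len(log_files)} log file(s)")

# Peek at the top-level keys of the first article without assuming a schema.
if json_files:
    with open(json_files[0]) as f:
        print(list(json.load(f).keys()))
```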

## Additional Options Enabled by Environment Variables

The following environment variables can be set to change the behavior of the pipeline:
- `USE_NER_MODEL_TYPE`: This variable can be set to `spacy` or `huggingface` to change the NER model used. The default is `huggingface`. This will be used to run batches with each model to evaluate final performance.
- `HF_NER_MODEL_NAME`: The name of the `huggingface-hub` repository hosting the huggingface model artifacts.
- `SPACY_NER_MODEL_NAME`: The name of the `huggingface-hub` repository hosting the spacy model artifacts.
- `MAX_SENTENCES`: This variable can be set to a number to limit the number of sentences processed per article. This is useful for testing and debugging. The default is `-1` which means no limit.
- `MAX_ARTICLES`: This variable can be set to a number to limit the number of articles processed. This is useful for testing and debugging. The default is `-1` which means no limit.
- `LOG_OUTPUT_DIR`: This variable is set to the path of the output folder to write the log file. Default is the directory from which the docker container is run.
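
These variables are read with `os.getenv` inside the pipeline; the sketch below mirrors the defaults set in `src/pipeline/entity_extraction_pipeline.py` in this change and is only an illustration of how the values are interpreted, not the full configuration logic.

```python
import os

# Defaults mirror the ones read in src/pipeline/entity_extraction_pipeline.py;
# -1 means "no limit".
USE_NER_MODEL_TYPE = os.getenv("USE_NER_MODEL_TYPE", "huggingface")
MAX_SENTENCES = int(os.getenv("MAX_SENTENCES", "-1"))
MAX_ARTICLES = int(os.getenv("MAX_ARTICLES", "-1"))
# Default assumed here; the pipeline writes its .log file into this directory.
LOG_OUTPUT_DIR = os.getenv("LOG_OUTPUT_DIR", ".")

if USE_NER_MODEL_TYPE not in ("huggingface", "spacy"):
    raise ValueError("USE_NER_MODEL_TYPE must be 'huggingface' or 'spacy'")
```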

## Sample Docker Run & Compose Setup
## Testing the Docker Image to Run on xDD

The docker image must be runnable without root permissions. To test that this is set up correctly, run the following command and ensure it completes without error.

```bash
docker run -u $(id -u) -p 5000:5000 -v /${PWD}/data/entity-extraction/raw/original_files/:/inputs/ -v /${PWD}/data/entity-extraction/processed/processed_articles/:/outputs/ --env LOG_OUTPUT_DIR="../outputs/" metaextractor-entity-extraction-pipeline:v0.0.3
```

**Details**:
- `$(id -u)` runs the docker container as the current user so that the output files are not owned by root
- `LOG_OUTPUT_DIR="../outputs/"` differs from the docker compose setup because it is relative to the current working directory, which for `docker run` starts in the `app` folder
- for Git Bash on Windows, `/${PWD}` is used to get the current directory; the leading forward slash is important to get the correct path

## Sample Docker Compose Setup

Update the environment variables and the volume paths defined under the `entity-extraction-pipeline` service in the `docker-compose.yml` file in the root directory. The volume paths are:
- `INPUT_FOLDER`: The folder containing the raw text `nlp352` TSV file, e.g. `./data/entity-extraction/raw/original_files/` (recommended)
- `OUTPUT_FOLDER`: The folder to dump the final JSON files, e.g. `./data/entity-extraction/processed/processed_articles/` (recommended)

Then build the docker image to install the required dependencies and run it using `docker-compose` as follows:

Below is a sample docker run command for running the image:
- the `$(id -u)` is used to run the docker container as the current user so that the output files are not owned by root
- the `LOG_OUTPUT_DIR="../outputs/"` is different from the docker compose as it is relative to the current directory which from Docker run starts in `app` folder
- for git bash on windows the `/${PWD}` is used to get the current directory and the forward slash is important to get the correct path
```bash
docker run -u $(id -u) -p 5000:5000 -v /${PWD}/data/entity-extraction/raw/original_files/:/inputs/ -v /${PWD}/data/entity-extraction/processed/processed_articles/:/outputs/ --env LOG_OUTPUT_DIR="../outputs/" metaextractor-entity-extraction-pipeline:v0.0.2
docker-compose build
docker-compose up entity-extraction-pipeline
```

Below is a sample docker compose configuration for running the image:
```yaml
version: "0.0.1"
services:
entity-extraction-pipeline:
image: metaextractor-entity-extraction-pipeline:v0.0.1
image: metaextractor-entity-extraction-pipeline:v0.0.3
build:
...
ports:
- "5000:5000"
volumes:
- ./data/raw/:/app/inputs/
- ./data/processed/:/app/outputs/
- ./data/entity-extraction/raw/<INPUT_FOLDER>:/inputs/
- ./data/entity-extraction/processed/<OUTPUT_FOLDER>:/outputs/
environment:
- USE_NER_MODEL_TYPE=huggingface
- LOG_OUTPUT_DIR=/app/outputs/
- LOG_OUTPUT_DIR=/outputs/
- MAX_SENTENCES=20
- MAX_ARTICLES=1
```
## Pushing the Docker Image to Docker Hub

To push the docker image to Docker Hub, first log in using the following command:

```bash
docker login
```

Then tag the docker image with the following two commands:

```bash
# to update the "latest" tag image
docker tag metaextractor-entity-extraction-pipeline:v<VERSION NUMBER> <DOCKER HUB USER ID>/metaextractor-entity-extraction-pipeline
# to upload a specific version tagged image
docker tag metaextractor-entity-extraction-pipeline:v<VERSION NUMBER> <DOCKER HUB USER ID>/metaextractor-entity-extraction-pipeline:v<VERSION NUMBER>
```

Finally, push both tags to Docker Hub using the following commands:

```bash
# push the "latest" tag
docker push <DOCKER HUB USER ID>/metaextractor-entity-extraction-pipeline
# push the version-specific tag
docker push <DOCKER HUB USER ID>/metaextractor-entity-extraction-pipeline:v<VERSION NUMBER>
```
9 changes: 7 additions & 2 deletions src/entity_extraction/spacy_entity_extraction.py
@@ -32,8 +32,13 @@ def spacy_extract_all(
"""

if ner_model == None:
logger.info("Empty model passed, return 0 labels.")
return []
try:
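# fall back to the packaged spaCy model (installed in the Dockerfile via the en_metaextractor_spacy wheel)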
import en_metaextractor_spacy
ner_model = en_metaextractor_spacy.load()
except Exception:
logger.error("Spacy model en_metaextractor_spacy not found.")
logger.info("Empty model passed, return 0 labels.")
return []

entities = []
doc = ner_model(text)
42 changes: 31 additions & 11 deletions src/pipeline/entity_extraction_pipeline.py
@@ -1,7 +1,7 @@
# Author: Ty Andrews
# Date: 2023-06-05
"""
Usage: entity_extraction.py --article_text_path=<article_text_path> --output_path=<output_path> [--max_sentences=<max_sentences>] [--max_articles=<max_articles>]
Usage: entity_extraction_pipeline.py --article_text_path=<article_text_path> --output_path=<output_path> [--max_sentences=<max_sentences>] [--max_articles=<max_articles>]

Options:
--article_text_path=<article_text_path> The path to the article text data file.
@@ -34,9 +34,11 @@
load_dotenv(find_dotenv())

# get the MODEL_NAME from environment variables
HF_NER_MODEL_NAME = os.getenv("HF_NER_MODEL_NAME", "roberta-finetuned-v3")
SPACY_NER_MODEL_NAME = os.getenv("SPACY_NER_MODEL_NAME", "spacy-transformer-v3")
HF_NER_MODEL_PATH = os.getenv("HF_NER_MODEL_PATH", "./models/ner/metaextractor")
SPACY_NER_MODEL_NAME = os.getenv("SPACY_NER_MODEL_NAME", "en_metaextractor_spacy")
USE_NER_MODEL_TYPE = os.getenv("USE_NER_MODEL_TYPE", "huggingface")
MAX_SENTENCES = os.getenv("MAX_SENTENCES", "-1")
MAX_ARTICLES = os.getenv("MAX_ARTICLES", "-1")

logger = get_logger(__name__)

@@ -286,7 +288,7 @@ def recreate_original_sentences_with_labels(row):
def extract_entities(
article_text_data: pd.DataFrame,
model_type: str = "huggingface",
model_path: str = os.path.join("models", "ner", "roberta-finetuned-v3"),
model_path: str = "metaextractor",
) -> pd.DataFrame:
"""
Extracts the entities from the article text data.
@@ -553,31 +555,42 @@ def main():

article_text_data = load_article_text_data(file_path)

if opt["--max_articles"] is not None and int(opt["--max_articles"]) != -1:
if MAX_ARTICLES is not None and int(MAX_ARTICLES) != -1:
article_text_data = article_text_data[
# 7 index used for testing with entities in first couple sentences of article 7
article_text_data["gddid"].isin(
article_text_data["gddid"].unique()[
0 : 0 + int(opt["--max_articles"])
0 : 0 + int(MAX_ARTICLES)
]
)
]
logger.info(
f"Using just a subsample of the data of with {int(MAX_ARTICLES)} articles"
)

# if max_sentences is not -1 then only use the first max_sentences sentences
if opt["--max_sentences"] is not None and int(opt["--max_sentences"]) != -1:
article_text_data = article_text_data.head(int(opt["--max_sentences"]))
if MAX_SENTENCES is not None and int(MAX_SENTENCES) != -1:
# get just sentence id's for each gdd up to max_sentences
article_text_data = article_text_data[
article_text_data["sentid"].isin(
article_text_data["sentid"].unique()[0 : int(MAX_SENTENCES)]
)
]
logger.info(
f"Using just a subsample of the data of with {int(MAX_SENTENCES)} sentences"
)

for article_gdd in article_text_data["gddid"].unique():
logger.info(f"Processing GDD ID: {article_gdd}")

article_text = article_text_data[article_text_data["gddid"] == article_gdd]

if USE_NER_MODEL_TYPE == "huggingface":
logger.info(f"Using HuggingFace model {HF_NER_MODEL_NAME}")
model_path = os.path.join("models", "ner", HF_NER_MODEL_NAME)
logger.info(f"Using HuggingFace model {HF_NER_MODEL_PATH}")
model_path = HF_NER_MODEL_PATH
elif USE_NER_MODEL_TYPE == "spacy":
logger.info(f"Using Spacy model {SPACY_NER_MODEL_NAME}")
model_path = os.path.join("models", "ner", SPACY_NER_MODEL_NAME)
model_path = SPACY_NER_MODEL_NAME
else:
raise ValueError(
f"Model type {USE_NER_MODEL_TYPE} not supported. Please set MODEL_TYPE to either 'huggingface' or 'spacy'."
@@ -611,6 +624,13 @@ def main():
)
continue

# delete the file if it already exists with the article_gdd name
if os.path.exists(os.path.join(opt["--output_path"], f"{article_gdd}.json")):
os.remove(os.path.join(opt["--output_path"], f"{article_gdd}.json"))
logger.warning(
f"Deleted existing file {article_gdd}.json in output directory."
)

export_extracted_entities(
extracted_entities=pprocessed_entities,
output_path=opt["--output_path"],