Scrape new data #40242
name: Scrape new data
on:
  repository_dispatch:
    types: scrape
  # TODO Ideally this will only run when changes to the scraper folder are made
  # push:
  #   branches:
  #     - master
  schedule:
    - cron: '0 * * * *'
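    # i.e. run at the top of every hour (UTC)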
  workflow_dispatch:
    inputs:
      scrape_all:
        description: 'Scrape all semesters (CURRENT_YEAR|ALL_YEARS|OLD_YEARS)'
        required: true
        type: choice
        default: 'CURRENT_YEAR'
        options:
          - CURRENT_YEAR
          - ALL_YEARS
          - OLD_YEARS
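# Note: ${{ github.event.inputs.scrape_all }} is only populated on
# workflow_dispatch runs; on schedule and repository_dispatch triggers it
# expands to an empty string, so the scrapers are presumably expected to
# fall back to their default (current-year) behavior.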
jobs:
  scrape-courses-and-prerequisites:
    name: Scrape courses and prerequisites
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Clone QuACS Data
        uses: actions/checkout@v3
        with:
          repository: quacs/quacs-data
          path: scrapers/quacs-data
      - name: Copy quacs-data files to correct locations
        working-directory: ./scrapers/quacs-data
        run: |
          cp all_schools.json ../all_schools.json
          cp -r semester_data ../data
      - name: Scrape courses
        working-directory: ./scrapers
        run: python3 sis_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload semester-specific data
        uses: actions/upload-artifact@v4
        with:
          name: courses
          path: scrapers/data/
  scrape-prereq-graph:
    name: Scrape prerequisite graph
    runs-on: ubuntu-latest
    needs: [scrape-courses-and-prerequisites]
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      # The prereq graph scraper turns data from all semesters into a single JSON,
      # so we need all semester data in a single directory.
      # The SIS scraper only outputs current semesters, so we merge its output
      # with the data already in the quacs-data repo.
      - name: Checkout data repository
        uses: actions/checkout@v3
        with:
          path: "scrapers/data"
          repository: "quacs/quacs-data"
          ref: "master"
          clean: true
          token: ${{ secrets.GITHUBTOKEN }}
      - name: Get scraped data
        uses: actions/download-artifact@v4
        with:
          path: scrapers
      - name: Copy newly-scraped semester data to directory
        working-directory: ./scrapers
        run: rsync -avz courses/ data/semester_data/
      - name: Build prerequisites graph
        working-directory: ./scrapers
        run: python3 prerequisites_graph/main.py data/semester_data prereq_graph.json
      - name: Upload prerequisite graph data
        uses: actions/upload-artifact@v4
        with:
          name: prereq_graph
          path: scrapers/prereq_graph.json
  # This may need to be updated now that the scrapers have been moved into the same repo as the site
  # scrape-degree-requirements:
  #   name: Scrape degree requirements
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Checkout degree planner
  #       uses: actions/checkout@v3
  #       with:
  #         path: 'degree-planner'
  #         repository: 'quacs/degree-planner'
  #     - name: Install geckodriver
  #       run: |
  #         export GECKODRIVER_VERSION="v0.27.0"
  #         wget https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
  #         tar -xvzf geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
  #         chmod +x geckodriver
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Set up Rust
  #       uses: actions-rs/toolchain@v1
  #       with:
  #         toolchain: stable
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r degree-planner/requirements.txt
  #     - name: Populate env
  #       run: printf "RIN=${{ secrets.RIN }}\nPASSWORD=${{ secrets.PASSWORD }}" > degree-planner/.env
  #     - name: Scrape courses
  #       run: |
  #         cd degree-planner
  #         python3 scraper.py refresh_data
  #     - name: Upload data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: degree_requirements
  #         path: degree-planner/*.json
  # scrape-hass-pathways:
  #   name: Scrape HASS pathways
  #   runs-on: ubuntu-latest
  #   timeout-minutes: 1
  #   steps:
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Scrape HASS pathways
  #       working-directory: ./scrapers
  #       run: python3 hass_pathways_scraper/main.py > hass_pathways.json
  #     - name: Upload data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: hass_pathways
  #         path: scrapers/hass_pathways.json
  scrape-faculty:
    name: Scrape faculty
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape faculty
        working-directory: ./scrapers
        run: python3 faculty_directory_scraper/main.py
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: faculty
          path: scrapers/faculty.json
  # This is broken; commenting it out so the job does not continue to fail
  # scrape-transfer:
  #   name: Scrape transfer
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Scrape transfer
  #       working-directory: ./scrapers
  #       run: python3 transfer_scraper/main.py csv
  #     - name: Upload JSON data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: transfer
  #         path: scrapers/transfer.json
  #     - name: Upload CSV data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: transfer_guides
  #         path: scrapers/transfer_guides
  # This may need to be updated now that the scrapers have been moved into the same repo as the site
  scrape-catalog:
    name: Scrape catalog by year
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape catalog
        working-directory: ./scrapers
        run: python3 catalog_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: catalog
          path: scrapers/data
  scrape-csci-topics:
    name: Scrape CSCI topics courses
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape CSCI topics descriptions from Goldschmidt's website
        working-directory: ./scrapers
        run: python3 csci_topics_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: csci_topics
          path: scrapers/data
  commit-data:
    name: Commit changes
    runs-on: ubuntu-latest
    needs:
      - scrape-courses-and-prerequisites
      - scrape-faculty
      # - scrape-hass-pathways
      - scrape-prereq-graph
      - scrape-catalog
      # - scrape-csci-topics
      # - scrape-transfer
    if: |
      always()
      && contains(needs.scrape-courses-and-prerequisites.result, 'success')
      && contains(needs.scrape-prereq-graph.result, 'success')
    steps:
      - name: Checkout data repository
        uses: actions/checkout@v3
        with:
          path: "scrapers/data"
          repository: "quacs/quacs-data"
          ref: "master"
          clean: true
          token: ${{ secrets.GITHUBTOKEN }}
      - name: Get scraped data
        uses: actions/download-artifact@v4
        with:
          path: scrapers
      - name: Commit new data
        working-directory: ./scrapers
        run: |
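          # Overlay each semester's catalog-scraper output onto the matching
          # newly-scraped courses directory before syncing into the data repo.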
          for directory in $(find courses/* -maxdepth 0 -type d); do
            DIR_BASENAME=$(basename "$directory")
            rsync -avzh --ignore-missing-args catalog/$DIR_BASENAME/* "$directory" || true
          done
          rsync -avzh courses/ data/semester_data/
          rsync -avzh prereq_graph/prereq_graph.json data/
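          # Several producer jobs (hass-pathways, transfer, csci-topics) are
          # currently disabled, so their artifacts may not exist;
          # --ignore-missing-args plus '|| true' keeps a missing artifact
          # from failing the whole commit step.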
          rsync -avzh --ignore-missing-args faculty/faculty.json data || true
          rsync -avzh --ignore-missing-args hass_pathways/hass_pathways.json data || true
          rsync -avzh --ignore-missing-args transfer/transfer.json data || true
          rsync -avzh --ignore-missing-args transfer_guides/ data/transfer_guides/ || true
          rsync -avzh --ignore-missing-args csci_topics/ data/semester_data/ || true
          cd data
          # Generate empty catalogs for any semester folders missing one,
          # to minimize build failures when a semester transition occurs.
          for directory in $(find semester_data/* -type d); do
            if [ ! -f "$directory/catalog.json" ]; then
              echo "Generating $directory/catalog.json"
              echo "{}" > "$directory/catalog.json"
            fi
          done
          # Merge SIS catalog + hard_coded + csci_topics data into the final values
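          # jq's '*' operator merges objects recursively, with the right-hand
          # operand winning on conflicting keys. Illustrative example (not run
          # here; file names are hypothetical):
          #   echo '{"CSCI":{"1100":{"name":"CS1"}}}' > a.json
          #   echo '{"CSCI":{"1100":{"desc":"intro"}}}' > b.json
          #   jq -s '.[0] * .[1]' a.json b.json
          #   # => {"CSCI":{"1100":{"name":"CS1","desc":"intro"}}}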
          for directory in $(find semester_data/* -type d | grep -v "hard_coded"); do
            if [ -f "$directory/catalog_sis.json" ]; then
              echo "$directory/catalog_sis.json"
              jq -a -s '.[0] * .[1]' "$directory/catalog_sis.json" "$directory/catalog.json" > "$directory/temp_catalog.json"
              mv "$directory/temp_catalog.json" "$directory/catalog.json"
              rm "$directory/catalog_sis.json"
              truncate -s -1 "$directory/catalog.json" # jq adds a trailing newline that we don't want
            fi
            for merge_data in hard_coded csci_topics; do
              if [ -d "$directory/$merge_data" ]; then
                if [ -f "$directory/$merge_data/catalog.json" ]; then
                  echo "$directory/catalog.json"
                  jq -a -s '.[0] * .[1]' "$directory/catalog.json" "$directory/$merge_data/catalog.json" > "$directory/temp_catalog.json"
                  mv "$directory/temp_catalog.json" "$directory/catalog.json"
                  truncate -s -1 "$directory/catalog.json" # jq adds a trailing newline that we don't want
                fi
              fi
            done
          done
          git config user.name "QuACS" && git config user.email "[email protected]"
          echo {\"last_updated\":\"$(date --iso-8601=seconds -u)\"} > meta.json
          git add .
          git commit -m "$(date -u)" # || exit 0
          # git add meta.json
          # git commit --amend --no-edit
          git push --force
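          # Notify the quacs/quacs repo that fresh data is available, so its
          # workflow listening for the "deploy" repository_dispatch event can
          # rebuild and deploy the site.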
          curl -H "Accept: application/vnd.github.everest-preview+json" \
            -H "Authorization: token ${{ secrets.GITHUBTOKEN }}" \
            --request POST \
            --data '{"event_type": "deploy", "client_payload": {"build_args": "-d"}}' \
            https://api.github.com/repos/quacs/quacs/dispatches
  # datadog-metrics:
  #   name: Send Datadog metrics
  #   runs-on: ubuntu-latest
  #   needs:
  #     - commit-data
  #   steps:
  #     - name: Install Datadog agent
  #       run: |
  #         DD_AGENT_MAJOR_VERSION=7 DD_API_KEY=${{ secrets.DATADOG_API_KEY }} DD_SITE="datadoghq.com" bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)"
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Checkout data repository
  #       uses: actions/checkout@v3
  #       with:
  #         path: "scrapers/data"
  #         repository: "quacs/quacs-data"
  #         ref: "master"
  #         clean: true
  #         token: ${{ secrets.GITHUBTOKEN }}
  #     - name: Generate and send metrics
  #       working-directory: ./scrapers
  #       run: python3 datadog/send_metrics.py