name: Scrape new data
on:
  repository_dispatch:
    types: scrape
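  # Triggered externally with a repository_dispatch API call, e.g.
  # (illustrative only; substitute a real token):
  #   curl -H "Authorization: token $GITHUB_TOKEN" \
  #     --request POST \
  #     --data '{"event_type": "scrape"}' \
  #     https://api.github.com/repos/quacs/quacs/dispatches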
  # TODO Ideally this will only run when changes to the scraper folder are made
  # push:
  #   branches:
  #     - master
  schedule:
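    # i.e. every hour, on the hour (UTC)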
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      scrape_all:
        description: 'Which semesters to scrape (CURRENT_YEAR|ALL_YEARS|OLD_YEARS)'
        required: true
        type: choice
        default: 'CURRENT_YEAR'
        options:
          - CURRENT_YEAR
          - ALL_YEARS
          - OLD_YEARS
jobs:
  scrape-courses-and-prerequisites:
    name: Scrape courses and prerequisites
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Clone QuACS data
        uses: actions/checkout@v3
        with:
          repository: quacs/quacs-data
          path: scrapers/quacs-data
      - name: Copy quacs-data files to correct locations
        working-directory: ./scrapers/quacs-data
        run: |
          cp all_schools.json ../all_schools.json
          cp -r semester_data ../data
      - name: Scrape courses
        working-directory: ./scrapers
        run: python3 sis_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload semester-specific data
        uses: actions/upload-artifact@v4
        with:
          name: courses
          path: scrapers/data/
  scrape-prereq-graph:
    name: Scrape prerequisite graph
    runs-on: ubuntu-latest
    needs: [scrape-courses-and-prerequisites]
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      # The prereq graph scraper turns data from all semesters into a single JSON,
      # so we need every semester's data in one directory. The SIS scraper only
      # outputs current semesters, so we merge its output with the data already
      # in the quacs-data repo.
      - name: Checkout data repository
        uses: actions/checkout@v3
        with:
          path: "scrapers/data"
          repository: "quacs/quacs-data"
          ref: "master"
          clean: true
          token: ${{ secrets.GITHUBTOKEN }}
      - name: Get scraped data
        uses: actions/download-artifact@v4
        with:
          path: scrapers
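      # Trailing slashes matter here: rsync copies the *contents* of courses/
      # into data/semester_data/ rather than nesting the courses directory itself.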
      - name: Copy newly-scraped semester data to directory
        working-directory: ./scrapers
        run: rsync -avz courses/ data/semester_data/
      - name: Build prerequisites graph
        working-directory: ./scrapers
        run: python3 prerequisites_graph/main.py data/semester_data prereq_graph.json
      - name: Upload prerequisite graph data
        uses: actions/upload-artifact@v4
        with:
          name: prereq_graph
          path: scrapers/prereq_graph.json
  # This may need to be updated now that the scrapers have been moved into the same repo as the site
  # scrape-degree-requirements:
  #   name: Scrape degree requirements
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Checkout degree planner
  #       uses: actions/checkout@v3
  #       with:
  #         path: 'degree-planner'
  #         repository: 'quacs/degree-planner'
  #     - name: Install geckodriver
  #       run: |
  #         export GECKODRIVER_VERSION="v0.27.0"
  #         wget https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
  #         tar -xvzf geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
  #         chmod +x geckodriver
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Set up Rust
  #       uses: actions-rs/toolchain@v1
  #       with:
  #         toolchain: stable
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r degree-planner/requirements.txt
  #     - name: Populate env
  #       run: printf "RIN=${{ secrets.RIN }}\nPASSWORD=${{ secrets.PASSWORD }}" > degree-planner/.env
  #     - name: Scrape courses
  #       run: |
  #         cd degree-planner
  #         python3 scraper.py refresh_data
  #     - name: Upload data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: degree_requirements
  #         path: degree-planner/*.json
  # scrape-hass-pathways:
  #   name: Scrape HASS pathways
  #   runs-on: ubuntu-latest
  #   timeout-minutes: 1
  #   steps:
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Scrape HASS pathways
  #       working-directory: ./scrapers
  #       run: python3 hass_pathways_scraper/main.py > hass_pathways.json
  #     - name: Upload data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: hass_pathways
  #         path: scrapers/hass_pathways.json
  scrape-faculty:
    name: Scrape faculty
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape faculty
        working-directory: ./scrapers
        run: python3 faculty_directory_scraper/main.py
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: faculty
          path: scrapers/faculty.json
  # This is broken; commented out so the job does not keep failing
  # scrape-transfer:
  #   name: Scrape transfer
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Scrape transfer
  #       working-directory: ./scrapers
  #       run: python3 transfer_scraper/main.py csv
  #     - name: Upload JSON data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: transfer
  #         path: scrapers/transfer.json
  #     - name: Upload CSV data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: transfer_guides
  #         path: scrapers/transfer_guides
  # This may need to be updated now that the scrapers have been moved into the same repo as the site
  scrape-catalog:
    name: Scrape catalog by year
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape catalog
        working-directory: ./scrapers
        run: python3 catalog_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: catalog
          path: scrapers/data
  scrape-csci-topics:
    name: Scrape CSCI topics courses
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape CSCI topics descriptions from Goldschmidt's website
        working-directory: ./scrapers
        run: python3 csci_topics_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: csci_topics
          path: scrapers/data
  commit-data:
    name: Commit changes
    runs-on: ubuntu-latest
    needs:
      - scrape-courses-and-prerequisites
      - scrape-faculty
      # - scrape-hass-pathways
      - scrape-prereq-graph
      - scrape-catalog
      # - scrape-csci-topics
      # - scrape-transfer
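    # always() lets this job run even if the optional scrape jobs fail,
    # as long as the course and prerequisite-graph scrapes succeeded.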
    if: |
      always()
      && contains(needs.scrape-courses-and-prerequisites.result, 'success')
      && contains(needs.scrape-prereq-graph.result, 'success')
    steps:
      - name: Checkout data repository
        uses: actions/checkout@v3
        with:
          path: "scrapers/data"
          repository: "quacs/quacs-data"
          ref: "master"
          clean: true
          token: ${{ secrets.GITHUBTOKEN }}
      - name: Get scraped data
        uses: actions/download-artifact@v4
        with:
          path: scrapers
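      # With no `name` filter, download-artifact@v4 fetches every artifact,
      # each into a subdirectory named after it (courses/, catalog/, faculty/, ...);
      # the paths below rely on that layout.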
      - name: Commit new data
        working-directory: ./scrapers
        run: |
          # Merge each semester's scraped catalog into its courses directory
          for directory in $(find courses/* -maxdepth 0 -type d); do
            DIR_BASENAME=$(basename "$directory")
            rsync -avzh --ignore-missing-args catalog/$DIR_BASENAME/* "$directory" || true
          done
          rsync -avzh courses/ data/semester_data/
          rsync -avzh prereq_graph/prereq_graph.json data/
          rsync -avzh --ignore-missing-args faculty/faculty.json data || true
          rsync -avzh --ignore-missing-args hass_pathways/hass_pathways.json data || true
          rsync -avzh --ignore-missing-args transfer/transfer.json data || true
          rsync -avzh --ignore-missing-args transfer_guides/ data/transfer_guides/ || true
          rsync -avzh --ignore-missing-args csci_topics/ data/semester_data/ || true
          cd data
          # Generate empty catalogs for any folders missing them.
          # This is to minimize build failures when a semester transition occurs.
          for directory in $(find semester_data/* -type d); do
            if [ ! -f "$directory/catalog.json" ]; then
              echo "Generating $directory/catalog.json"
              echo "{}" > "$directory/catalog.json"
            fi
          done
          # Merge SIS catalog + hard_coded + csci_topics data into the final values
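          # jq -s '.[0] * .[1]' performs a recursive object merge, with keys from
          # the second file winning on conflict, e.g.
          #   {"a":{"x":1}} * {"a":{"y":2}}  ->  {"a":{"x":1,"y":2}}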
          for directory in $(find semester_data/* -type d | grep -v "hard_coded"); do
            if [ -f "$directory/catalog_sis.json" ]; then
              echo "$directory/catalog_sis.json"
              jq -a -s '.[0] * .[1]' "$directory/catalog_sis.json" "$directory/catalog.json" > "$directory/temp_catalog.json"
              mv "$directory/temp_catalog.json" "$directory/catalog.json"
              rm "$directory/catalog_sis.json"
              truncate -s -1 "$directory/catalog.json" # jq adds a trailing newline that we don't want
            fi
            for merge_data in hard_coded csci_topics; do
              if [ -d "$directory/$merge_data" ] && [ -f "$directory/$merge_data/catalog.json" ]; then
                echo "$directory/catalog.json"
                jq -a -s '.[0] * .[1]' "$directory/catalog.json" "$directory/$merge_data/catalog.json" > "$directory/temp_catalog.json"
                mv "$directory/temp_catalog.json" "$directory/catalog.json"
                truncate -s -1 "$directory/catalog.json" # jq adds a trailing newline that we don't want
              fi
            done
          done
          git config user.name "QuACS" && git config user.email "[email protected]"
          echo "{\"last_updated\":\"$(date --iso-8601=seconds -u)\"}" > meta.json
          git add .
          git commit -m "$(date -u)" # || exit 0
          # git add meta.json
          # git commit --amend --no-edit
          git push --force
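          # Kick off the site's deploy workflow in quacs/quacs via repository_dispatch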
          curl -H "Accept: application/vnd.github.everest-preview+json" \
            -H "Authorization: token ${{ secrets.GITHUBTOKEN }}" \
            --request POST \
            --data '{"event_type": "deploy", "client_payload": {"build_args": "-d"}}' \
            https://api.github.com/repos/quacs/quacs/dispatches
  # datadog-metrics:
  #   name: Send Datadog metrics
  #   runs-on: ubuntu-latest
  #   needs:
  #     - commit-data
  #   steps:
  #     - name: Install Datadog agent
  #       run: |
  #         DD_AGENT_MAJOR_VERSION=7 DD_API_KEY=${{ secrets.DATADOG_API_KEY }} DD_SITE="datadoghq.com" bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)"
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Checkout data repository
  #       uses: actions/checkout@v3
  #       with:
  #         path: "scrapers/data"
  #         repository: "quacs/quacs-data"
  #         ref: "master"
  #         clean: true
  #         token: ${{ secrets.GITHUBTOKEN }}
  #     - name: Generate and send metrics
  #       working-directory: ./scrapers
  #       run: python3 datadog/send_metrics.py