name: Scrape new data
on:
  repository_dispatch:
    types: scrape
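  # Triggered externally with a repository_dispatch API call, e.g.
  # (illustrative only; substitute a real token):
  #   curl -H "Authorization: token $GITHUB_TOKEN" \
  #     --request POST \
  #     --data '{"event_type": "scrape"}' \
  #     https://api.github.com/repos/quacs/quacs/dispatches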
  # TODO Ideally this will only run when changes to the scraper folder are made
  # push:
  #   branches:
  #     - master
  schedule:
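    # i.e. every hour, on the hour (UTC)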
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      scrape_all:
        description: 'Which semesters to scrape (CURRENT_YEAR|ALL_YEARS|OLD_YEARS)'
        required: true
        type: choice
        default: 'CURRENT_YEAR'
        options:
          - CURRENT_YEAR
          - ALL_YEARS
          - OLD_YEARS
jobs:
  scrape-courses-and-prerequisites:
    name: Scrape courses and prerequisites
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Clone QuACS data
        uses: actions/checkout@v3
        with:
          repository: quacs/quacs-data
          path: scrapers/quacs-data
      - name: Copy quacs-data files to correct locations
        working-directory: ./scrapers/quacs-data
        run: |
          cp all_schools.json ../all_schools.json
          cp -r semester_data ../data
      - name: Scrape courses
        working-directory: ./scrapers
        run: python3 sis_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload semester-specific data
        uses: actions/upload-artifact@v4
        with:
          name: courses
          path: scrapers/data/
  scrape-prereq-graph:
    name: Scrape prerequisite graph
    runs-on: ubuntu-latest
    needs: [scrape-courses-and-prerequisites]
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      # The prereq graph scraper turns data from all semesters into a single JSON,
      # so we need every semester's data in one directory. The SIS scraper only
      # outputs current semesters, so we merge its output with the data already
      # in the quacs-data repo.
      - name: Checkout data repository
        uses: actions/checkout@v3
        with:
          path: "scrapers/data"
          repository: "quacs/quacs-data"
          ref: "master"
          clean: true
          token: ${{ secrets.GITHUBTOKEN }}
      - name: Get scraped data
        uses: actions/download-artifact@v4
        with:
          path: scrapers
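      # Trailing slashes matter here: rsync copies the *contents* of courses/
      # into data/semester_data/ rather than nesting the courses directory itself.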
      - name: Copy newly-scraped semester data to directory
        working-directory: ./scrapers
        run: rsync -avz courses/ data/semester_data/
      - name: Build prerequisites graph
        working-directory: ./scrapers
        run: python3 prerequisites_graph/main.py data/semester_data prereq_graph.json
      - name: Upload prerequisite graph data
        uses: actions/upload-artifact@v4
        with:
          name: prereq_graph
          path: scrapers/prereq_graph.json
  # This may need to be updated now that the scrapers have been moved into the same repo as the site
  # scrape-degree-requirements:
  #   name: Scrape degree requirements
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Checkout degree planner
  #       uses: actions/checkout@v3
  #       with:
  #         path: 'degree-planner'
  #         repository: 'quacs/degree-planner'
  #     - name: Install geckodriver
  #       run: |
  #         export GECKODRIVER_VERSION="v0.27.0"
  #         wget https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
  #         tar -xvzf geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
  #         chmod +x geckodriver
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Set up Rust
  #       uses: actions-rs/toolchain@v1
  #       with:
  #         toolchain: stable
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r degree-planner/requirements.txt
  #     - name: Populate env
  #       run: printf "RIN=${{ secrets.RIN }}\nPASSWORD=${{ secrets.PASSWORD }}" > degree-planner/.env
  #     - name: Scrape courses
  #       run: |
  #         cd degree-planner
  #         python3 scraper.py refresh_data
  #     - name: Upload data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: degree_requirements
  #         path: degree-planner/*.json
  # scrape-hass-pathways:
  #   name: Scrape HASS pathways
  #   runs-on: ubuntu-latest
  #   timeout-minutes: 1
  #   steps:
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Scrape HASS pathways
  #       working-directory: ./scrapers
  #       run: python3 hass_pathways_scraper/main.py > hass_pathways.json
  #     - name: Upload data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: hass_pathways
  #         path: scrapers/hass_pathways.json
  scrape-faculty:
    name: Scrape faculty
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape faculty
        working-directory: ./scrapers
        run: python3 faculty_directory_scraper/main.py
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: faculty
          path: scrapers/faculty.json
  # This is broken; commented out so the job does not keep failing
  # scrape-transfer:
  #   name: Scrape transfer
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Scrape transfer
  #       working-directory: ./scrapers
  #       run: python3 transfer_scraper/main.py csv
  #     - name: Upload JSON data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: transfer
  #         path: scrapers/transfer.json
  #     - name: Upload CSV data
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: transfer_guides
  #         path: scrapers/transfer_guides
  # This may need to be updated now that the scrapers have been moved into the same repo as the site
  scrape-catalog:
    name: Scrape catalog by year
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape catalog
        working-directory: ./scrapers
        run: python3 catalog_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: catalog
          path: scrapers/data
  scrape-csci-topics:
    name: Scrape CSCI topics courses
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.13.3'
      - name: Install pip requirements
        run: |
          python -m pip install --upgrade pip
          pip install -r scrapers/requirements.txt
      - name: Scrape CSCI topics descriptions from Goldschmidt's website
        working-directory: ./scrapers
        run: python3 csci_topics_scraper/main.py ${{ github.event.inputs.scrape_all }}
      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: csci_topics
          path: scrapers/data
  commit-data:
    name: Commit changes
    runs-on: ubuntu-latest
    needs:
      - scrape-courses-and-prerequisites
      - scrape-faculty
      # - scrape-hass-pathways
      - scrape-prereq-graph
      - scrape-catalog
      # - scrape-csci-topics
      # - scrape-transfer
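    # always() lets this job run even if the optional scrape jobs fail,
    # as long as the course and prerequisite-graph scrapes succeeded.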
    if: |
      always()
      && contains(needs.scrape-courses-and-prerequisites.result, 'success')
      && contains(needs.scrape-prereq-graph.result, 'success')
    steps:
      - name: Checkout data repository
        uses: actions/checkout@v3
        with:
          path: "scrapers/data"
          repository: "quacs/quacs-data"
          ref: "master"
          clean: true
          token: ${{ secrets.GITHUBTOKEN }}
      - name: Get scraped data
        uses: actions/download-artifact@v4
        with:
          path: scrapers
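      # With no `name` filter, download-artifact@v4 fetches every artifact,
      # each into a subdirectory named after it (courses/, catalog/, faculty/, ...);
      # the paths below rely on that layout.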
      - name: Commit new data
        working-directory: ./scrapers
        run: |
          # Merge each semester's scraped catalog into its courses directory
          for directory in $(find courses/* -maxdepth 0 -type d); do
            DIR_BASENAME=$(basename "$directory")
            rsync -avzh --ignore-missing-args catalog/$DIR_BASENAME/* "$directory" || true
          done
          rsync -avzh courses/ data/semester_data/
          rsync -avzh prereq_graph/prereq_graph.json data/
          rsync -avzh --ignore-missing-args faculty/faculty.json data || true
          rsync -avzh --ignore-missing-args hass_pathways/hass_pathways.json data || true
          rsync -avzh --ignore-missing-args transfer/transfer.json data || true
          rsync -avzh --ignore-missing-args transfer_guides/ data/transfer_guides/ || true
          rsync -avzh --ignore-missing-args csci_topics/ data/semester_data/ || true
          cd data
          # Generate empty catalogs for any folders missing them.
          # This is to minimize build failures when a semester transition occurs.
          for directory in $(find semester_data/* -type d); do
            if [ ! -f "$directory/catalog.json" ]; then
              echo "Generating $directory/catalog.json"
              echo "{}" > "$directory/catalog.json"
            fi
          done
          # Merge SIS catalog + hard_coded + csci_topics data into the final values
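          # jq -s '.[0] * .[1]' performs a recursive object merge, with keys from
          # the second file winning on conflict, e.g.
          #   {"a":{"x":1}} * {"a":{"y":2}}  ->  {"a":{"x":1,"y":2}}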
          for directory in $(find semester_data/* -type d | grep -v "hard_coded"); do
            if [ -f "$directory/catalog_sis.json" ]; then
              echo "$directory/catalog_sis.json"
              jq -a -s '.[0] * .[1]' "$directory/catalog_sis.json" "$directory/catalog.json" > "$directory/temp_catalog.json"
              mv "$directory/temp_catalog.json" "$directory/catalog.json"
              rm "$directory/catalog_sis.json"
              truncate -s -1 "$directory/catalog.json" # jq adds a trailing newline that we don't want
            fi
            for merge_data in hard_coded csci_topics; do
              if [ -d "$directory/$merge_data" ] && [ -f "$directory/$merge_data/catalog.json" ]; then
                echo "$directory/catalog.json"
                jq -a -s '.[0] * .[1]' "$directory/catalog.json" "$directory/$merge_data/catalog.json" > "$directory/temp_catalog.json"
                mv "$directory/temp_catalog.json" "$directory/catalog.json"
                truncate -s -1 "$directory/catalog.json" # jq adds a trailing newline that we don't want
              fi
            done
          done
          git config user.name "QuACS" && git config user.email "[email protected]"
          echo "{\"last_updated\":\"$(date --iso-8601=seconds -u)\"}" > meta.json
          git add .
          git commit -m "$(date -u)" # || exit 0
          # git add meta.json
          # git commit --amend --no-edit
          git push --force
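          # Kick off the site's deploy workflow in quacs/quacs via repository_dispatch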
          curl -H "Accept: application/vnd.github.everest-preview+json" \
            -H "Authorization: token ${{ secrets.GITHUBTOKEN }}" \
            --request POST \
            --data '{"event_type": "deploy", "client_payload": {"build_args": "-d"}}' \
            https://api.github.com/repos/quacs/quacs/dispatches
  # datadog-metrics:
  #   name: Send Datadog metrics
  #   runs-on: ubuntu-latest
  #   needs:
  #     - commit-data
  #   steps:
  #     - name: Install Datadog agent
  #       run: |
  #         DD_AGENT_MAJOR_VERSION=7 DD_API_KEY=${{ secrets.DATADOG_API_KEY }} DD_SITE="datadoghq.com" bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)"
  #     - name: Checkout branch
  #       uses: actions/checkout@v3
  #     - name: Set up Python
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.13.3'
  #     - name: Install pip requirements
  #       run: |
  #         python -m pip install --upgrade pip
  #         pip install -r scrapers/requirements.txt
  #     - name: Checkout data repository
  #       uses: actions/checkout@v3
  #       with:
  #         path: "scrapers/data"
  #         repository: "quacs/quacs-data"
  #         ref: "master"
  #         clean: true
  #         token: ${{ secrets.GITHUBTOKEN }}
  #     - name: Generate and send metrics
  #       working-directory: ./scrapers
  #       run: python3 datadog/send_metrics.py