```bash
docker compose up
docker ps # should show the zookeeper and kafka containers up and running
```
- to test kafka
```bash
docker exec -it tweeeeedbt-kafka-1 bash
```
- inside the container run
```bash
kafka-topics.sh --list --bootstrap-server localhost:9092
```
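If you'd rather verify the broker from the host instead, here is a minimal sketch using kafka-python (an assumption: `pip install kafka-python`; the `smoke-test` topic name is hypothetical):

```python
from kafka import KafkaProducer, KafkaConsumer

# Round-trip one message through the broker exposed on localhost:9092.
producer = KafkaProducer(bootstrap_servers="localhost:9092")
producer.send("smoke-test", b"hello kafka")  # hypothetical topic name
producer.flush()

consumer = KafkaConsumer(
    "smoke-test",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",  # read from the beginning of the topic
    consumer_timeout_ms=5000,      # give up after 5 s instead of blocking forever
)
for message in consumer:
    print(message.value)  # b'hello kafka' means the round-trip works
    break
```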
- Requirements

Java 17, Python 3.8.10

ps: if you have a higher version of either, pyspark will probably not install correctly, and even if it does it will not work correctly :)
- installing Spark
```bash
wget https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
tar -xvzf spark-3.5.5-bin-hadoop3.tgz
mv spark-3.5.5-bin-hadoop3 ~/spark
nano ~/.bashrc # or ~/.zshrc
```
- add the following lines, then restart your shell
```bash
export SPARK_HOME=~/spark
export PATH=$SPARK_HOME/bin:$PATH
```
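To confirm the install worked, a quick PySpark smoke test (a sketch; assumes `pyspark` is installed against the Python 3.8.10 interpreter above):

```python
from pyspark.sql import SparkSession

# Start a throwaway local session and print the version;
# it should report 3.5.5 if everything lines up.
spark = SparkSession.builder.master("local[*]").appName("smoke-test").getOrCreate()
print(spark.version)
spark.stop()
```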
- installation
```bash
sudo apt install postgresql postgresql-contrib
```
- enabling the postgresql service
```bash
sudo systemctl status postgresql
sudo systemctl start postgresql
sudo systemctl enable postgresql
```
- access by switching to the default postgres user
```bash
sudo -i -u postgres
psql
```
- creating a user and granting permissions inside the psql shell
```sql
-- Create a database
CREATE DATABASE tweedbt;

-- Create a user with a password
CREATE USER <username> WITH PASSWORD '<password>'; -- these values should go into your .env

-- Grant privileges
GRANT ALL PRIVILEGES ON DATABASE tweedbt TO <username>;
```
- applying the schema
```bash
cd DB/
psql -U <username> -d tweedbt -f schema.sql
```
- accessing psql
```bash
psql -U <username> -d tweedbt
```
- `\l` to list all databases
- `\d` to list all relations
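To sanity-check the connection from Python with the credentials in your .env, a minimal sketch (assumes `pip install psycopg2-binary python-dotenv`; the `DB_USER`/`DB_PASSWORD` variable names are hypothetical, use whatever your .env defines):

```python
import os

import psycopg2
from dotenv import load_dotenv

load_dotenv()  # pulls DB_USER / DB_PASSWORD from .env (hypothetical names)

conn = psycopg2.connect(
    dbname="tweedbt",
    user=os.environ["DB_USER"],
    password=os.environ["DB_PASSWORD"],
    host="localhost",
    port=5432,
)
with conn, conn.cursor() as cur:
    # List the tables created by schema.sql to confirm it was applied.
    cur.execute("SELECT tablename FROM pg_tables WHERE schemaname = 'public';")
    print(cur.fetchall())
conn.close()
```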
- Make sure the old data is flushed before each run
```bash
bash reset-kafka.sh
```
- Run each of the following commands in separate terminals and in the same order (a sketch of the Kafka read side is shown below)
```bash
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 stream_processing.py
python3 consumer_<whatever>.py
python3 producer.py
```
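For orientation, here is a minimal sketch of the kind of job `spark-submit` launches above, reading a Kafka stream via the `spark-sql-kafka` package. This is not the repo's actual `stream_processing.py`; the `tweets` topic name and the console sink are assumptions:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("stream-sketch").getOrCreate()

# Subscribe to a Kafka topic; Kafka delivers raw bytes, so cast value to a string.
stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "tweets")  # hypothetical topic name
    .load()
    .selectExpr("CAST(value AS STRING) AS value")
)

# Dump each micro-batch to the console instead of a real sink.
query = stream.writeStream.format("console").outputMode("append").start()
query.awaitTermination()
```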
- Data Visualization Layer
- Folder Structuring
- System Architecture Diagram
- Dockerize Streaming