1) reddit submissions: https://files.pushshift.io/reddit/submissions/
2) reddit comments: http://files.pushshift.io/reddit/comments/
zstd - pip install zstandard
nltk - pip install nltk

|---data
| |---2010
| | |--- RS_2010-01.zst
| | |--- ...
| | |--- RS_2010-12.zst
| | |--- RC_2010-01.zst
| | |--- ...
| | |--- RC_2010-12.zst
| |---origin
| | |--- rs_2010.csv
| | |--- rc_2010.csv
| | |--- ...
| |---processed
| | |--- dataset1.csv
| | |--- dataset2.csv
| | |--- dataset3.csv

$ python data_extract.py --data_path {$DATA_PATH} --subreddit {$SUBREDDIT_NAME} --year {$YEAR}
$ python create_dataset1.py --data_path {$DATA_PATH} --subreddit {$SUBREDDIT_NAME} --year {$YEAR}
$ python create_dataset2.py --data_path {$DATA_PATH} --subreddit {$SUBREDDIT_NAME} --year {$YEAR}
$ python de-identification.py --data_path {$DATA_PATH} --subreddit {$SUBREDDIT_NAME} --year {$YEAR}
$ python concat_doc.py --data_path {$DATA_PATH}
$ python concat_data.py --data_path {$DATA_PATH}
$ python concat_dialog.py --data_path {$DATA_PATH} --subreddit {$SUBREDDIT_NAME} --dtype {$DIALOG_TYPE}