Pulling Files In and Out
# Export the DataFrame to a CSV file
SP_hist.to_csv("/Users/maria/Desktop/csv_files/SP_hist.csv", index=True, sep=',')
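Reading the CSV back in is the mirror operation (a sketch; index_col=0 restores the index that index=True saved):
SP_hist = pd.read_csv("/Users/maria/Desktop/csv_files/SP_hist.csv", index_col=0)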
# Import the data from the Systembolaget API documentation (just downloaded the JSON)
import json

with open("/Users/maria/OneDrive/Documents_old/CodeOpDocs/Milestones/Group_Project/Drinks_data/assortment.json", 'r', encoding='utf-8') as our_file:
    our_file_as_dictionary = json.load(our_file)

# Print the loaded data
print(our_file_as_dictionary)

# Make the dictionary into a DataFrame
api_data = pd.DataFrame(our_file_as_dictionary)
LISTS
# Make a list of stocks that are both in my universe and in the S&P 500
combined = []
for stock in universe:
    if stock in SNP['Symbol'].values:
        combined.append(stock)
print(combined)
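The same filter can be written as a list comprehension, which is the more idiomatic form (a sketch using the same universe and SNP names):
combined = [stock for stock in universe if stock in SNP['Symbol'].values]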
# Put all owners on one line, separated by commas
owner_agg = reporting_owner.groupby('ACCESSION_NUMBER').agg({'RPTOWNERNAME': ','.join})
# Remove strange categories like flavored wine and fruity sparkling
removal_wines = ['Sparkling wines, flavored', 'Blå, sparkling', 'Fruit wine',
                 'Rosé wine', 'Sparkling wines, fruit wine', 'Sweet red wines',
                 'Rosé wines, Fruity & Flavorful', 'Flavored wine',
                 'Other fortified wines', 'Sweet white wines',
                 'MOUSSERANDE VINER röda', 'MOUSSERANDE VINER, smaksatt',
                 'Blå, mousserande', 'Fruktvin', 'Rosé Wines', 'CHAMPAGNE, söt',
                 'MOUSSERANDE VINER, fruktvin', 'Red Wines, Sweet',
                 'Rosé Wines, Fruity & Flavorful', 'Flavored Wine',
                 'Other Fortified Wines', 'White Wines, Sweet']
for i in removal_wines:
    full_wine = full_wine[full_wine['Headline'] != i]
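The loop can be replaced with a single vectorized filter using isin (a sketch with the same names):
full_wine = full_wine[~full_wine['Headline'].isin(removal_wines)]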
TUPLES
# Pull data out of the tuples
vivino_data['match_name'] = vivino_data['best_match'].str[0]
vivino_data['match_percentage'] = vivino_data['best_match'].str[1]
vivino_data['match_reference'] = vivino_data['best_match'].str[2]
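All three columns can also be unpacked in one step (a sketch, assuming every best_match value is a three-element tuple):
vivino_data[['match_name', 'match_percentage', 'match_reference']] = pd.DataFrame(
    vivino_data['best_match'].tolist(), index=vivino_data.index)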
DICTIONARIES
Using a dictionary to rename
# Merge the dictionaries into one
translation_dict_1.update(another_translation_dict)
translation_dict_1.update(yet_another_dictionary)

# Use dictionaries to translate both content and column headers
full_set = full_set.replace(translation_dict_1)
full_set = full_set.rename(columns=translation_dict_1)
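On Python 3.9+ the same merge can be written with the | operator, which leaves the input dictionaries untouched (a sketch; later dictionaries win on duplicate keys, matching the update order above):
translation_dict_1 = translation_dict_1 | another_translation_dict | yet_another_dictionary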
DataFrame
# Make a copy of the dataframe
piv_table_control = piv_table.copy()
Making a column filled with NA
piv_table_control['transacted_avg'] = np.nan
DF singular calculations
# Audit that the number of BAC shares really is that large
result = combo.groupby('ISSUERTRADINGSYMBOL')['TRANS_SHARES'].sum()
bac_count = result[result.index == 'BAC']
bac_count
# Calculate what share of the data points each transaction type makes up
combo['TRANS_CODE'].value_counts(normalize=True)
# Remove stocks where there is no data available
piv_table = piv_table.dropna(how='any', subset=['STOCK_t0'])
# Adjust the names used in graphs so all Berkshire/Buffett entries are the same
combo['RPTOWNERNAME'] = combo['RPTOWNERNAME'].str.replace(
    'BERKSHIRE HATHAWAY INC,BUFFETT WARREN E',
    'BUFFETT WARREN E,BERKSHIRE HATHAWAY INC')
combo
DF Column calculations
# Make a field saying whether the stock in the submission data is in the S&P 500
combo['is_sp500'] = combo['ISSUERTRADINGSYMBOL'].isin(combined)

# Calculate the cost of transacted shares per line item (to be used later for getting the average cost of transacted stock)
combo['value_transacted'] = combo['TRANS_SHARES'] * combo['TRANS_PRICEPERSHARE']
Renaming Columns
stock_prices = stock_prices.rename(columns={'variable': 'ref_time', 'value': 'price'})
# Calculate the max and min across the stock price columns
price_cols = ['STOCK_-1', 'STOCK_t0', 'STOCK_t1', 'STOCK_t2', 'STOCK_t3', 'STOCK_t4', 'STOCK_t5']
piv_table['max'] = np.amax(piv_table[price_cols], axis=1)
piv_table['min'] = np.amin(piv_table[price_cols], axis=1)
Groupby
# Total units sold per product group
full_wine.groupby('Product_Group_Details')['Units_sold'].sum()
Sort_values
# Order the wines ascending in price
full_wine = full_wine.sort_values(by='Actual_Price', ascending=True)
DF FULL calculations
Merge dataframes
combo = submission.merge(non_deriv_trans, on='ACCESSION_NUMBER', how='inner')
combo = combo.merge(owner_agg, on='ACCESSION_NUMBER', how='inner')
Pivot Table
piv_table_buys = pd.pivot_table(combo_buys,
                                index=['TRANS_DATE', 'ISSUERTRADINGSYMBOL'],
                                aggfunc={'TRANS_SHARES': 'sum', 'value_transacted': 'sum'})
Join DF
# Join the data again, stacking buys and sells ***VERTICAL (axis=0 stacks rows; axis=1 would join horizontally)
piv_table = pd.concat([piv_table_sells, piv_table_buys], axis=0)
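For reference, the two concat directions (a sketch with hypothetical frames a and b):
stacked = pd.concat([a, b], axis=0)        # rows of b appended below a
side_by_side = pd.concat([a, b], axis=1)   # columns of b appended beside a, aligned on the index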
iterrows:
# Calculate t-1 (the day before the transaction)
for idx, row in piv_table.iterrows():
    trans_date = row['TRANS_DATE']
    symbol = row['ISSUERTRADINGSYMBOL']
    try:
        value = data_neg_1.loc[trans_date, symbol]
        piv_table.loc[idx, "STOCK_-1"] = value
    except KeyError:
        piv_table.loc[idx, "STOCK_-1"] = np.nan
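Row loops over DataFrames are slow; the same lookup can be done with one merge (a sketch, assuming data_neg_1 has dates in the index and one column per ticker, and that TRANS_DATE and ISSUERTRADINGSYMBOL are regular columns of piv_table):
long_prices = data_neg_1.stack().reset_index()
long_prices.columns = ['TRANS_DATE', 'ISSUERTRADINGSYMBOL', 'STOCK_-1']
piv_table = piv_table.merge(long_prices, on=['TRANS_DATE', 'ISSUERTRADINGSYMBOL'], how='left')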
MELT to collapse columns
# Make this long form
stock_prices_returns = pd.melt(piv_combined_normalized_prices,
                               id_vars=['TRANS_DATE', 'ISSUERTRADINGSYMBOL', 'TRANS_SHARES', 'direction', 'is_control'],
                               value_vars=['rTime_0', 'rTime_1', 'rTime_2', 'rTime_3', 'rTime_4', 'rTime_5'])
Index calculations
# Make a new index that we can later reference (copy so the loop does not mutate the original 'data')
data_control = data.copy()
ref = -1
for idx, row in data_control.iterrows():
    ref = ref + 1
    data_control.loc[idx, "ref_num"] = ref

# Pull out the dates as our index and replace them with our ref_num
data_control['Date'] = data_control.index
data_control = data_control.set_index('ref_num')
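The loop can be skipped entirely: reset_index produces the same 0..n-1 counter in one call (a sketch; the names= argument needs pandas 1.5+):
data_control = data.copy().reset_index(names='Date')
data_control.index.name = 'ref_num'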
Using Apply:
# Bucket prices into labeled ranges (thresholds aligned with the labels)
def bucket(x):
    if x >= 500:
        return 'Over 500'
    elif x >= 250:
        return '250 to 499'
    elif x >= 200:
        return '200 to 249'
    elif x >= 150:
        return '150 to 199'
    elif x >= 100:
        return '100 to 149'
    elif x >= 75:
        return '75 to 99'
    else:
        return 'Less than 75'

bucket_wine['price_bucket'] = bucket_wine['Actual_Price'].apply(bucket)
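The same bucketing can be done vectorized with pd.cut (a sketch; right=False makes each bin include its lower edge, matching the thresholds above):
bins = [0, 75, 100, 150, 200, 250, 500, float('inf')]
labels = ['Less than 75', '75 to 99', '100 to 149', '150 to 199', '200 to 249', '250 to 499', 'Over 500']
bucket_wine['price_bucket'] = pd.cut(bucket_wine['Actual_Price'], bins=bins, labels=labels, right=False)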
# Use the match reference (as received from the fuzzy match) to pull out the correct dictionary and put it into best_match_details
vivino_data['best_match_details'] = vivino_data.apply(lambda row: row['wine_matches'][row['match_reference']], axis=1)

# Then search the dictionary for the correct field and make that into a column
vivino_data['vivino_name'] = vivino_data['best_match_details'].apply(lambda x: x['name'])
vivino_data['vivino_link'] = vivino_data['best_match_details'].apply(lambda x: x['link'])
vivino_data['vivino_country'] = vivino_data['best_match_details'].apply(lambda x: x['country'])
vivino_data['vivino_region'] = vivino_data['best_match_details'].apply(lambda x: x['region'])
vivino_data['vivino_average_rating'] = vivino_data['best_match_details'].apply(lambda x: x['average_rating'])
vivino_data['vivino_price'] = vivino_data['best_match_details'].apply(lambda x: x['price'])
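The six apply calls can be collapsed with pd.json_normalize (a sketch, assuming every best_match_details dict carries the same keys):
details = pd.json_normalize(vivino_data['best_match_details'].tolist()).add_prefix('vivino_')
details.index = vivino_data.index
vivino_data = vivino_data.join(details)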
DateTime
# Put the date into datetime format
combo['TRANS_DATE'] = pd.to_datetime(combo['TRANS_DATE'], format="%d-%b-%Y")

# Remove the time-zone data from 'data' so it can be indexed
data = data.tz_convert(None)
Shifting
# Get the stock price for the day before the transaction; the 5 days after follow the same pattern
data_neg_1 = data.shift(periods=1)
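For the days after the transaction, shift the other way (a sketch following the same pattern; the data_pos_* names are hypothetical):
data_pos_1 = data.shift(periods=-1)   # one day after
data_pos_5 = data.shift(periods=-5)   # five days after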
YFinance
import yfinance as yf

# Pull in stock data from Yahoo Finance via the yfinance package
data = yf.download(stock_selection, period="1y")
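If stock_selection is a list of tickers, yf.download returns a column MultiIndex of (price field, ticker); selecting one field gives the one-column-per-ticker shape used in the lookups above (an assumption about this workflow):
data = yf.download(stock_selection, period="1y")['Close']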
Statistics
from scipy import stats

# One-sided Mann-Whitney U test: are the experiment returns stochastically lower than the control returns?
stats.mannwhitneyu(Experiment_returns, Control_returns, alternative='less')
Random numbers
# Go back to piv_table_control and give our transactions some random dates
# Assign a random time variable within our reference range, corresponding to ref_num and the period sampled (randrange's stop is exclusive, so this yields 150-215)
import random

random.seed(42)
for idx, row in piv_table_control.iterrows():
    a = random.randrange(start=150, stop=216, step=1)
    piv_table_control.loc[idx, 'random_date_index'] = int(a)
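A vectorized alternative with NumPy avoids the row loop (a sketch with the same exclusive upper bound):
rng = np.random.default_rng(42)
piv_table_control['random_date_index'] = rng.integers(150, 216, size=len(piv_table_control))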
Working with JS files
# Unfortunately the data is in string format, and each string needs to become a dictionary before it can go into a DataFrame
# As rerunning the retrieval took forever, the stored strings are instead parsed with a different package
import ast

vivino_data['wine_matches'] = vivino_data['wine_matches'].apply(lambda x: ast.literal_eval(x))
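If the stored strings were strict JSON (double quotes, null instead of None), json.loads would do the same job; ast.literal_eval is the safer choice for Python-style reprs (a sketch):
vivino_data['wine_matches'] = vivino_data['wine_matches'].apply(json.loads)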
import subprocess

# Define the command to run the Node.js script
def retrieve_wine_matches(wine_name):
    command = [
        "node",
        "C:/Users/maria/OneDrive/Documents_old/CodeOpDocs/Milestones/Group_Project/Vivino_api/vivino-api/vivino.js",
        f"--name='{wine_name}'"
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        print("Node.js script output:", result.stdout)
        # Parse the output JSON file
        with open("C:/Users/maria/OneDrive/Documents_old/CodeOpDocs/Milestones/Group_Project/Drinks_data/vivino-out.json",
                  "r", encoding="utf-8", errors="ignore") as f:
            data = json.load(f)
        return data['vinos']
    except subprocess.CalledProcessError:
        data = np.nan
        return data
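The function is then applied per wine to build the wine_matches column used above (a sketch; full_name as the input column is an assumption):
vivino_data['wine_matches'] = vivino_data['full_name'].apply(retrieve_wine_matches)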
FUZZ Data
# 'process' and 'fuzz' come from a fuzzy-matching library such as thefuzz (formerly fuzzywuzzy)
def find_best_match(row):
    dataf_choices = pd.DataFrame(row['wine_matches'])
    the_name = row['full_name']
    if 'name' in dataf_choices.columns:
        match = process.extractOne(the_name, dataf_choices['name'], scorer=fuzz.token_set_ratio)
        return match
    else:
        return np.nan
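Applied row-wise, this fills the best_match column that the tuple extraction above unpacks; extractOne against a Series returns a (value, score, index) triple (a sketch):
vivino_data['best_match'] = vivino_data.apply(find_best_match, axis=1)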
API
REST API, SOAP, RPC, U
# Unique values from the "Bottle Type" column as a list
unique_values_list = df['Bottle Type'].dropna().unique().tolist()

# Create a new DataFrame from the unique values list
unique_values_df = pd.DataFrame(unique_values_list, columns=['Bottle Type'])
print(unique_values_df)

column_to_translate = list(unique_values_df["Bottle Type"].unique())
import deepl

# Your DeepL API key (replace with your own key)
DEEPL_API_KEY = '***'
dl_translator = deepl.Translator(DEEPL_API_KEY)

# Put the unique values in a dictionary as keys
translation_dict = {}
for type_name in column_to_translate:
    if type_name.strip() != "":  # Skip if the value is empty or just whitespace
        try:
            # Translate the text; .text pulls the translated string out of the TextResult object
            translated_result = dl_translator.translate_text(type_name, target_lang="EN-US").text
            translation_dict[type_name] = translated_result
        except deepl.DeepLException as e:
            print(f"Error translating value '{type_name}': {e}")
            translation_dict[type_name] = type_name
    else:
        # If the type_name is empty, keep it unchanged in the dictionary
        translation_dict[type_name] = type_name

# Display the translation dictionary
print("Translation dictionary:")
print(translation_dict)
Unit Tests (with a coverage of 98%)
Unit tests can also be written with pytest (pip install pytest).
Repository layout:
    src/
    tests/
    data/
    examples/
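A minimal pytest sketch (assumes the bucket function above lives in src/buckets.py; run with pytest, or pytest --cov=src for the coverage number, which needs the pytest-cov plugin):

# tests/test_buckets.py
from src.buckets import bucket

def test_bucket_boundaries():
    assert bucket(600) == 'Over 500'
    assert bucket(80) == '75 to 99'
    assert bucket(10) == 'Less than 75'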