-
Notifications
You must be signed in to change notification settings - Fork 28
Expand file tree
/
Copy pathdistribution_languages.py
More file actions
132 lines (112 loc) · 4.16 KB
/
Copy pathdistribution_languages.py
File metadata and controls
132 lines (112 loc) · 4.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Summarise per-language dataset sizes and sample counts.
#
# Reads four text files from the current working directory and prints, for the
# raw and the filtered dataset, a Markdown table, followed by a combined LaTeX
# table and comma-separated lists of language names and sample counts.
#
# Size files (kilobytes.txt, kilobytes_filtered.txt): one "<kilobytes> <name>"
# pair per line, e.g. from
#     du -sc data/* | sort -h > kilobytes.txt
# Line-count files (line_counts.txt, line_counts_filtered.txt): one
# "<name><sep> <count>" pair per line; only the last path component of <name>
# is kept and its final character is dropped.
# NOTE(review): earlier comments cited `wc -l */*.jsonl | sort -h`, but `wc`
# prints the count first, which is not what the parser below reads -- confirm
# how the line-count files are actually produced.
# NOTE(review): names in the size files must equal the names derived from the
# line-count files, otherwise building the tables raises KeyError.

RAW_SIZE_FILE = 'kilobytes.txt'
RAW_COUNT_FILE = 'line_counts.txt'
FILTERED_SIZE_FILE = 'kilobytes_filtered.txt'
FILTERED_COUNT_FILE = 'line_counts_filtered.txt'
SEPARATOR = "-" * 80


def read_text(path):
    """Return the full contents of the text file at *path*."""
    with open(path, encoding="utf-8") as handle:
        return handle.read()


def percent(part, whole, ndigits):
    """Return ``part / whole`` as a percentage rounded to *ndigits* places.

    Returns ``0.0`` when *whole* is zero instead of raising
    ``ZeroDivisionError``.
    """
    if not whole:
        return 0.0
    return round(part / whole * 100, ndigits)


def by_value_desc(mapping):
    """Return the items of *mapping* sorted by value, largest first."""
    return sorted(mapping.items(), key=lambda item: item[1], reverse=True)


def parse_sizes(text, echo=False):
    """Parse ``<kilobytes> <name>`` lines into per-name megabyte sizes.

    Args:
        text: Contents of a size file.
        echo: When true, print the split fields of every line (debug aid that
            the script has always written to stdout).

    Returns:
        ``(sizes, total)`` where ``sizes`` maps each name (including a
        ``"total"`` row if the input has one) to its size divided by 1000, and
        ``total`` is the sum over every row whose name is not ``"total"``.
    """
    sizes = {}
    total = 0
    for line in text.strip().split('\n'):
        parts = line.split()
        if not parts:
            # Blank line (or empty file): nothing to record.
            continue
        if echo:
            print(parts)
        megabytes = int(parts[0]) / 1000  # Convert to megabytes
        name = parts[1]
        sizes[name] = megabytes
        if name != "total":
            total += megabytes
    return sizes, total


def parse_line_counts(text, echo=False):
    """Parse ``<name><sep> <count>`` lines into per-name sample counts.

    The name is the last ``/``-separated component of the first field with its
    final character removed (presumably a trailing separator such as ``:``).

    Args:
        text: Contents of a line-count file.
        echo: When true, print the split fields of every line.

    Returns:
        ``(counts, total)`` where ``total`` sums every row not named
        ``"total"`` and ``counts["total"]`` is always set to that sum,
        overriding any ``"total"`` row in the input.
    """
    counts = {}
    total = 0
    for line in text.strip().split('\n'):
        parts = line.split()
        if not parts:
            continue
        if echo:
            print(parts)
        samples = int(parts[1])
        name = parts[0].split("/")[-1][:-1]
        counts[name] = samples
        if name != "total":
            total += samples
    counts["total"] = total
    return counts, total


def load_stats(size_file, count_file):
    """Read and parse one size file and its matching line-count file.

    Returns:
        ``(sizes, total_size, counts, total_count)``.
    """
    # Read both files before parsing so a missing file fails before any output.
    size_text = read_text(size_file)
    count_text = read_text(count_file)
    sizes, total_size = parse_sizes(size_text, echo=True)
    counts, total_count = parse_line_counts(count_text, echo=True)
    return sizes, total_size, counts, total_count


def markdown_table(sizes, total_size, counts, total_count):
    """Return the lines of a Markdown table, one row per name, largest first.

    Raises:
        KeyError: if a name in *sizes* has no entry in *counts*.
    """
    lines = [
        '| Name | Megabytes | % of total | Samples | % of total |',
        '| --- | --- | --- | --- | --- |',
    ]
    for name, megabytes in by_value_desc(sizes):
        size_share = percent(megabytes, total_size, 4)
        samples = counts[name]
        sample_share = percent(samples, total_count, 4)
        lines.append(
            f'| {name} | {megabytes} | {size_share}% | {samples} | {sample_share}% |'
        )
    return lines


def latex_table(sizes, total_size, counts, sizes_f, total_size_f, counts_f):
    """Return the lines of the combined raw/filtered LaTeX table body.

    Every line has seven cells: the name, then (megabytes, samples, % of total
    megabytes) for the raw data and the same three for the filtered data.
    Names missing from the filtered data are reported as zero.

    Raises:
        KeyError: if a name in *sizes* has no entry in *counts*.
    """
    lines = [
        ' & \\multicolumn{3}{c}{Raw} & \\multicolumn{3}{c}{Filtered} \\\\',
        # Header cells mirror the order and number of cells in each data row.
        'Name & Megabytes & Samples & % of total (MB) '
        '& Megabytes & Samples & % of total (MB) \\\\',
        '\\midrule',
    ]
    for name, megabytes in by_value_desc(sizes):
        size_share = percent(megabytes, total_size, 2)
        samples = round(counts[name], 2)
        megabytes_f = round(sizes_f.get(name, 0), 2)
        # The filtered share is derived from the already-rounded size, as before.
        size_share_f = percent(megabytes_f, total_size_f, 2)
        samples_f = round(counts_f.get(name, 0), 2)
        lines.append(
            f'{name} & {round(megabytes, 2)} & {samples} & {size_share} '
            f'& {megabytes_f} & {samples_f} & {size_share_f} \\\\'
        )
    return lines


def main():
    """Print the Markdown tables, the LaTeX table and the summary lists."""
    sizes, total_size, counts, total_count = load_stats(
        RAW_SIZE_FILE, RAW_COUNT_FILE
    )
    print('\n'.join(markdown_table(sizes, total_size, counts, total_count)))

    ################
    sizes_f, total_size_f, counts_f, total_count_f = load_stats(
        FILTERED_SIZE_FILE, FILTERED_COUNT_FILE
    )
    print('\n'.join(markdown_table(sizes_f, total_size_f, counts_f, total_count_f)))

    ################
    # Latex table
    print(SEPARATOR)
    print('\n'.join(
        latex_table(sizes, total_size, counts, sizes_f, total_size_f, counts_f)
    ))

    # Print all languages, largest first (raw, then filtered)
    print(SEPARATOR)
    print('", "'.join(name for name, _ in by_value_desc(sizes)))
    print(SEPARATOR)
    print('", "'.join(name for name, _ in by_value_desc(sizes_f)))

    # Print all sample counts; the filtered list follows the raw ordering so
    # the two lists line up position by position.
    raw_order = by_value_desc(counts)
    print(SEPARATOR)
    print(', '.join(str(samples) for _, samples in raw_order))
    print(SEPARATOR)
    print(', '.join(str(counts_f.get(name, 0)) for name, _ in raw_order))


if __name__ == "__main__":
    main()