Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 516c417

Browse files
Copilotjosix
andcommitted
Implement terminology extraction tools and generate translation dictionaries
Co-authored-by: josix <[email protected]>
1 parent c63f0d6 commit 516c417

File tree

6 files changed

+18528
-1
lines changed

6 files changed

+18528
-1
lines changed

.scripts/README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,34 @@
22

33
Useful scripts for the translation.
44

5+
## Translation Dictionary Generation
6+
7+
Extract terminology from all .po files and build a translation dictionary so that terms are translated consistently.
8+
9+
### extract_terminology.py
10+
Main script that processes all .po files and extracts terminology:
11+
12+
```sh
13+
python3 .scripts/extract_terminology.py
14+
```
15+
16+
Generates `terminology_dictionary.csv` with all extracted terms and their translations.
17+
18+
### create_focused_dictionary.py
19+
Creates a curated dictionary focusing on the most important Python terminology:
20+
21+
```sh
22+
python3 .scripts/create_focused_dictionary.py
23+
```
24+
25+
Generates `focused_terminology_dictionary.csv` with categorized high-priority terms.
26+
27+
See the terminology documentation for detailed usage and for how to integrate the dictionaries into the translation workflow.
28+
529
## From Google Translation
630

731
Translate all untranslated entries of the given .po file with Google Translate.
832

9-
1033
```sh
1134
.scripts/google_translate.sh library/csv.po
1235
```

.scripts/create_focused_dictionary.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Create a focused terminology dictionary for the most important Python terms.
4+
5+
This script extracts the most critical Python terminology for translation consistency.
6+
"""
7+
8+
import csv
9+
from collections import defaultdict, Counter
10+
11+
12+
# Lower-cased terms grouped by the category they are reported under.
_CORE_CONCEPTS = frozenset({
    'class', 'function', 'method', 'module', 'package', 'object', 'type',
})
_BUILTIN_TYPES = frozenset({
    'int', 'str', 'list', 'dict', 'tuple', 'set', 'float', 'bool', 'complex',
})
_KEYWORDS_CONSTANTS = frozenset({
    'none', 'true', 'false', 'return', 'import', 'def', 'async', 'await',
})
# Keywords that make a term important but have no dedicated category.
_OTHER_KEYWORDS = frozenset({
    'lambda', 'yield', 'raise', 'try', 'except', 'finally', 'with', 'as',
})
_EXACT_MATCH_TERMS = (
    _CORE_CONCEPTS | _BUILTIN_TYPES | _KEYWORDS_CONSTANTS | _OTHER_KEYWORDS
)

# Substrings (matched case-insensitively) that mark common Python concepts.
_CONCEPT_SUBSTRINGS = (
    'exception', 'error', 'iterator', 'generator', 'decorator', 'property',
    'classmethod', 'staticmethod', 'metaclass', 'inheritance', 'polymorphism',
)
# Substrings (matched case-sensitively) that mark technical-looking terms.
_TECHNICAL_PATTERNS = ('()', 'Error', 'Exception', 'Class')

_OUTPUT_FIELDS = [
    'source_term', 'translated_term', 'frequency', 'files_count',
    'priority', 'category', 'example_files',
]


def _is_important(source_term, frequency, files_count):
    """Return True when a term qualifies for the focused dictionary."""
    lowered = source_term.lower()

    # High priority: Python built-in types and keywords
    if lowered in _EXACT_MATCH_TERMS:
        return True
    # High priority: common Python concepts
    if any(concept in lowered for concept in _CONCEPT_SUBSTRINGS):
        return True
    # High priority: terms that appear in many files (widespread usage)
    if files_count >= 20 and frequency >= 10:
        return True
    # Medium priority: code elements in backticks, or dunder names.
    # Parenthesised to make the intended `a or (b and c)` grouping explicit.
    if '`' in source_term or (
        source_term.startswith('__') and source_term.endswith('__')
    ):
        return True
    # Medium priority: terms with technical patterns
    return any(pattern in source_term for pattern in _TECHNICAL_PATTERNS)


def _categorize(source_term, files_count):
    """Return the ``(category, priority)`` pair reported for a term."""
    lowered = source_term.lower()

    if lowered in _CORE_CONCEPTS:
        return 'Core Concepts', 'High'
    if lowered in _BUILTIN_TYPES:
        return 'Built-in Types', 'High'
    if lowered in _KEYWORDS_CONSTANTS:
        return 'Keywords/Constants', 'High'
    if 'error' in lowered or 'exception' in lowered:
        return 'Exceptions', 'High'
    if '`' in source_term:
        return 'Code Elements', 'Medium'
    if files_count >= 50:
        return 'Common Terms', 'High'
    return 'Other', 'Medium'


def create_focused_dictionary(
    input_path="terminology_dictionary.csv",
    output_path="focused_terminology_dictionary.csv",
):
    """Create a focused dictionary with the most important terms.

    Reads the full terminology CSV, keeps the rows judged important by
    ``_is_important``, sorts them by descending frequency, and writes them
    to a new CSV with added ``priority`` and ``category`` columns.  A
    summary and a per-category breakdown are printed to stdout.

    Args:
        input_path: CSV to read.  Must provide the columns ``source_term``,
            ``translated_term``, ``frequency``, ``files_count`` and
            ``example_files``.
        output_path: CSV to write (overwritten if it exists).

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If a required column is missing from the input.
        ValueError: If ``frequency`` or ``files_count`` is not an integer.
    """
    # Read the full terminology dictionary and keep only important rows.
    important_terms = []
    with open(input_path, 'r', encoding='utf-8', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            source_term = row['source_term'].strip()
            frequency = int(row['frequency'])
            files_count = int(row['files_count'])
            if _is_important(source_term, frequency, files_count):
                important_terms.append(row)

    # Sort by frequency (most common first); the sort is stable, so ties
    # keep their input order.
    important_terms.sort(key=lambda row: int(row['frequency']), reverse=True)

    # Count categories while writing so that the printed breakdown always
    # matches what was actually written to the file.
    category_counts = Counter()

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=_OUTPUT_FIELDS)
        writer.writeheader()

        for term_data in important_terms:
            source_term = term_data['source_term'].strip()
            category, priority = _categorize(
                source_term, int(term_data['files_count'])
            )
            category_counts[category] += 1

            writer.writerow({
                'source_term': source_term,
                'translated_term': term_data['translated_term'],
                'frequency': term_data['frequency'],
                'files_count': term_data['files_count'],
                'priority': priority,
                'category': category,
                'example_files': term_data['example_files'],
            })

    print(f"Created focused terminology dictionary with {len(important_terms)} important terms")

    # Print category statistics
    print("\nCategory breakdown:")
    for category, count in category_counts.items():
        print(f" {category}: {count} terms")
137+
138+
139+
if __name__ == "__main__":
    # Build the focused dictionary only when executed as a script,
    # not when this module is imported.
    create_focused_dictionary()

0 commit comments

Comments
 (0)