1+ # Sebastian Raschka 2014
2+ #
3+ # A Python function to generalize first and last names.
# The typical use case of such a function is to merge data that have been
# collected from different sources (e.g., names of soccer players, as shown in
# the doctests).
6+ #
7+
8+ import unicodedata
9+ import string
10+ import re
11+
def _ascii_letters_only(text):
    """Return `text` with accents stripped and all non-ASCII-letters removed."""
    # NFKD splits an accented character into its base letter plus a combining
    # mark; keeping only ASCII letters then drops the mark (and any
    # punctuation such as apostrophes or hyphens) but keeps the base letter.
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if ch in string.ascii_letters)


def preprocess_names(name, output_sep=' ', firstname_output_letters=1):
    """
    Function that outputs a person's name in the format
    <last_name><separator><firstname letter(s)> (all lowercase)

    Parameters
    ----------
    name : str
        Either "First [Middle ...] Last" or "Last, First [Middle ...]".
        A comma anywhere in the string selects the second form.
    output_sep : str
        Text placed between the last name and the first-name letters.
        The whole result is lowercased, so the separator is lowercased too.
    firstname_output_letters : int
        Number of leading letters of the first name to keep (used as a
        slice bound). If no letters are kept, only the last name is
        returned, without a trailing separator.

    Returns
    -------
    str
        The normalized name, or '' if `name` contains no usable token.

    >>> preprocess_names("Samuel Eto'o")
    'etoo s'

    >>> preprocess_names("Eto'o, Samuel")
    'etoo s'

    >>> preprocess_names("Eto'o,Samuel")
    'etoo s'

    >>> preprocess_names('Xavi')
    'xavi'

    >>> preprocess_names('Yaya Touré')
    'toure y'

    >>> preprocess_names('José Ángel Pozo')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel')
    'pozo j'

    >>> preprocess_names('Pozo, José Ángel', firstname_output_letters=2)
    'pozo jo'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=2)
    'etoo sa'

    >>> preprocess_names("Eto'o, Samuel", firstname_output_letters=0)
    'etoo'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ')
    'etoo, s'

    >>> preprocess_names("Eto'o, Samuel", output_sep=', ', firstname_output_letters=0)
    'etoo'

    >>> preprocess_names('')
    ''

    """
    # A comma signals "Last, First" order; otherwise "First ... Last".
    last_name_first = ',' in name

    # Split on any whitespace (and the comma) so tabs/newlines separate
    # tokens just like spaces do.
    tokens = name.replace(',', ' ').split()
    if not tokens:
        # Empty or whitespace-only input: nothing to normalize.
        return ''

    if len(tokens) == 1:
        return _ascii_letters_only(tokens[0]).lower()

    # Only two tokens are used; middle names are dropped.
    #   "First Middle Last"  -> first token and final token
    #   "Last, First Middle" -> first token and second token
    lead = _ascii_letters_only(tokens[0])
    follow = _ascii_letters_only(tokens[1] if last_name_first else tokens[-1])

    if not (lead and follow):
        # One token had no ASCII letters left; fall back to whatever remains.
        return (lead or follow).lower()

    if last_name_first:
        last, first = lead, follow
    else:
        last, first = follow, lead

    initials = first[:firstname_output_letters]
    if not initials:
        # Avoid leaving a dangling separator after the last name.
        return last.lower()

    return ('%s%s%s' % (last, output_sep, initials)).lower()
78+
79+
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module's
    # docstrings.
    import doctest

    doctest.testmod()
0 commit comments