Page 12 sur 25
Work with phones
phonenumbers library
If you have a phone field and a country code field:
... import phonenumbers ... df['Phone'] = df['Phone'].fillna('') df['CountryCode'] = df['CountryCode'].fillna('') df['TestPhone'] = df.apply( lambda x: phonenumbers.parse(str(x['Phone']), str(x['CountryCode'])) if str(x['Phone']) != '' and str(x['CountryCode']) != '' else '', axis='columns' )
Full phone fix
# FIX PHONE df['Phone'] = df['Phone'].str.strip() df['Phone'] = df['Phone'].replace({'-': ' '}, regex=True) df['Phone'] = df['Phone'].replace({'_': ' '}, regex=True) df['Phone'] = df['Phone'].replace({'\.': ' '}, regex=True) df['Phone'] = df['Phone'].replace({',': ' '}, regex=True) df['Phone'] = df['Phone'].replace({' ': ' '}, regex=True) df['Phone'] = df['Phone'].replace({'[a-zA-Z]': ''}, regex=True) df['Phone'] = df['Phone'].replace({' ': ' '}, regex=True) df['Phone'] = df['Phone'].replace({'^ ': ''}, regex=True) df['Phone'] = df['Phone'].replace({'\*': ''}, regex=True) df['Phone'] = df['Phone'].replace({'\(': ''}, regex=True) df['Phone'] = df['Phone'].replace({'\)': ''}, regex=True) df['Phone'] = df['Phone'].replace({'\$': ''}, regex=True) df['Phone'] = df['Phone'].str.strip() df['Phone'] = df['Phone'].fillna('') df.loc[df['Phone'].str.replace(' ', '').str.len() < 4, 'Phone'] = '' df['CountryCode'] = df['CountryCode'].fillna('') # CREATE TESTPHONE FIELD df['TestPhone'] = '' df = df.reset_index(drop=True) for i, row in df.iterrows(): myPhone = row['Phone'] myCountryCode = row['CountryCode'] MyTest = '' try: # IF PHONE EXISTS AND START WITH "+" AND COUNTRY EXISTS if myPhone != '' and myPhone.startswith('+') and myCountryCode != '': MyTest = phonenumbers.parse(myPhone, myCountryCode) # IF PHONE EXISTS AND NOT START WITH "+" AND COUNTRY EXISTS if myPhone != '' and not myPhone.startswith('+') and myCountryCode != '': MyTest = phonenumbers.parse(myPhone, myCountryCode) # IF PHONE EXISTS AND START WITH "+" AND COUNTRY NOT EXISTS if myPhone != '' and myPhone.startswith('+') and myCountryCode == '': MyTest = phonenumbers.parse(myPhone, myCountryCode) # IF PHONE EXISTS AND NOT START WITH "+" AND COUNTRY NOT EXISTS if myPhone != '' and not myPhone.startswith('+') and myCountryCode == '': MyTest = phonenumbers.parse('+' + myPhone, myCountryCode) except: pass df.at[i,'TestPhone'] = MyTest df['AutoCode'] = '' df.loc[df['TestPhone'] != '', 'AutoCode'] = '+' + df['TestPhone'].astype(str).replace({'Country Code: ': ''}, regex=True).replace({'( National Number: ).*': ''}, regex=True) df['AutoNationalPhone'] = '' df.loc[df['TestPhone'] != '', 'AutoNationalPhone'] = df['TestPhone'].astype(str).replace({'^.*? National Number: ': ''}, regex=True).astype(str).replace({'( Leading Zero).*': ''}, regex=True) # CONCATENATION df['Formatted Phone'] = df[['AutoCode', 'AutoNationalPhone']].apply(lambda x: ' '.join(x.dropna()), axis=1).fillna('') df.loc[~df['Formatted Phone'].str.startswith(('+')), 'Formatted Phone'] = df['Phone'] # AUTOCOUNTRYCODE df['AutoCountryCode'] = '' df = df.reset_index(drop=True) for i, row in df.iterrows(): myCountryCode = row['CountryCode'] MyTest = '' try: MyTest = phonenumbers.country_code_for_region(myCountryCode) except: pass df.at[i,'AutoCountryCode'] = MyTest # LAST df.loc[(~df['Formatted Phone'].str.startswith(('+'))) & (df['AutoCountryCode'] != 0) & (df['Phone'] != ''), 'Formatted Phone'] = '+' + df['AutoCountryCode'].astype(str) + ' ' + df['Phone'].replace({'^0': ''}, regex=True) # IS_POSSIBLE_NUMBER df['is_possible_number'] = '' for i, row in df.iterrows(): myPhone = row['Formatted Phone'] MyTest = '' try: MyTest = phonenumbers.is_possible_number(phonenumbers.parse(myPhone, None)) except: pass df.at[i, 'is_possible_number'] = MyTest # IS_VALID_NUMBER df['is_valid_number'] = '' for i, row in df.iterrows(): myPhone = row['Formatted Phone'] MyTest = '' try: MyTest = phonenumbers.is_valid_number(phonenumbers.parse(myPhone, None)) except: pass df.at[i, 'is_valid_number'] = MyTest # DELETE WRONG NUMBER df.loc[(df['is_valid_number'] == False) & (df['is_possible_number'] == False), 'Formatted Phone'] = '' # RE-ORDER df = df[['Phone', 'CountryCode', 'Formatted Phone']] print(tabulate(df.head(30), headers='keys', tablefmt='psql', showindex=False))