Miscellaneous
Where is Python? Which version?
Some possibilities, for example:
import platform print('Python version:\n', platform.python_version()) import sys print('\nWhere is Python:') print(sys.executable) import sys print('\nPython version:') print(sys.version)
Create a dataframe from a previous dataframe (copy)
df2 = df1.copy()
Copy data from a dataframe in the clipboard
Very convenient to copy-paste quickly in a text file, CSV, Excel ...
df.to_clipboard(sep=',', index=False, header=None)
Get list from field
MyList = df['My field'].drop_duplicates().dropna().sort_values(ascending=True).tolist() print(MyList)
Get value from a cell
myValue = df.iloc[0]['my_field']
Get column names in a list
listColumn = df.columns.tolist()
Dedupe
To dedupe a dataframe on an Email field keeping only the first duplicate record, use drop_duplicates
.
df['Email'] = df['Email'].str.lower() dfDeduped = df.drop_duplicates(subset=['Email'], keep='first')
Find duplicates
Use duplicated
:
df2 = df1[df1.duplicated(['My field'], keep=False)] print(tabulate(df2.head(10), headers='keys', tablefmt='psql', showindex=False))
Find no-duplicates
df2 = df1[~df1.duplicated(['My field'], keep=False)] print(tabulate(df2.head(10), headers='keys', tablefmt='psql', showindex=False))
Export a picture from a chart
And add a border.
from PIL import Image, ImageOps fig1.savefig('C:/Users/myplot.png', dpi=100) im = Image.open('C:/Users/myplot2.png') bordered = ImageOps.expand(im, border=1, fill=(0, 0, 0)) bordered.save('C:/Users/myplot2.png')
Re-organize columns
df = df[['Field 1', 'Adresse mail', 'Note algorithmique', 'Substitutions']]
Rename dataframe
New_df = pd.DataFrame(Old_df)
Rename columns
From their original name:
df.rename(columns={'Old name 1': 'New name 1', 'Old name 1': 'New name 1'}, inplace=True)
From their position:
df.columns.values[12] = 'New name'
Delete column
df.drop('My field', axis=1, inplace=True)
Format number with space thousand separator
number = df.shape[0] print(format(number, ',').replace(',', ' '), 'customers')
Or:
f"{number_of_rows:,}".replace(',', ' ')
Ranking
To rank according to a field, from highest to lowest:
df['My rank'] = df['My field to rank'].rank(ascending=False)
Avoid duplicate:
df['My rank'] = df['My field to rank'].rank(ascending=False, method='first')
Generate a unique random integer
If you want a unique random integer from 1 included to the number of records included.
howManyRecord = df.shape[0] df['Random number'] = np.random.choice(range(howManyRecord), howManyRecord, replace=False) df.loc[df['Random number'] == 0, 'Random number'] = howManyRecord
Select with condition
df = df.loc[df['Field to filter'] == 'Yes']
Prevent nan
Use keep_default_na=False
:
Data = pd.read_excel(inputExcelFile, sheet_name='Feuil1', engine='openpyxl', keep_default_na=False)
Replace the default NA value
Data = pd.read_excel(inputExcelFile, sheet_name='Feuil1', engine='openpyxl', keep_default_na=False), na_values=['_'])
Get data like a SQL UPDATE
Very usefull to build SQL update queries from a file with the primary key in first column :
file1 = open(MyPath+'FixQueries.sql', 'w', encoding='cp1252') # ÉCRITURE DU FICHIER for i in df.itertuples(index=False): file1.write('UPDATE MyTable SET ' +\ str(i)[7:-1].split(', ', 1)[-1] +\ " WHERE id = '" + i[0] + "' ;\n")
Get data like a SQL INSERT
# SQL TO INSERT IN TARGET for index, row in df.iterrows(): columns = ', '.join(row.index) values = ', '.join(f"'{v}'" for v in row.values) query = f'''INSERT IGNORE INTO ma_table ({columns}) VALUES ({values}) ;''' print(query)
Use index in concatenation
df.index.astype(str)
Reset index
df.reset_index(drop=True, inplace=True)
Drop index
To drop the index, you have to use another field as index.
df = df.set_index('my field')
Open a box to enter a string and use it as a variable
import tkinter as tk from tkinter import simpledialog window = tk.Tk() window.eval('tk::PlaceWindow . center') window.withdraw() repertoireCarto = simpledialog.askstring("Saisie", "Répertoire ?\t\t\t\t", parent=window) print('Le répertoire est', repertoireCarto)
Get some line indexes according condition
list_duplicate = df.index[df['My field'] == 'Text'].tolist()
Import external scripts
Your working script needs to know where to find the external scripts:
sys.path.append('C:\\Users\\Georges\\PycharmProjects\\Your_Directory') from YourScriptWithoutExtension import SomeVariables, ...
Subprocess
Not related to Pandas but very useful to run shell command, bash, console scripts...
import subprocess subprocess.run('psql -h localhost -p 5432 -d work -U postgres -c "SELECT NOW() ;"', shell=True)
And to use pgsql2shp
in subprocess
for shape export:
subprocess.run( \ '''pgsql2shp -f E:\\Path\\to\\MyShape -u postgres -h localhost work "SELECT * FROM MyTable ;"''', \ shell=False, \ executable='E:\\Path\\to\\pgsql2shp.exe' \ )
And to use PowerShell
or OGR2OGR
in subprocess
:
subprocess.run( \ '''OGR2OGR...''', \ shell=False, \ executable='E:\\Path\\to\\powershell.exe' \ )
Strings comparison
Compare the similarity of 2 strings, to get a indicator in a new field:
import difflib from difflib import SequenceMatcher df['indicator'] = df[['Field1', 'Field2']].apply(lambda x: SequenceMatcher(lambda y: y == " ", x[0], x[1]).ratio(), axis=1)
Count time execution
import time ... print('Time for now is %s secondes' % round(time.process_time(), 1)) timeNow = time.process_time() ... print('Time now is %s secondes' % round(time.process_time() - timeNow, 1)) timeNow = time.process_time() ...
Include external Python code (without real importation)
... YourFileCode = r'C:/Users/Georges/PycharmProjects/WorkEMC/YourFileCode.py' ... exec(compile(open(YourFileCode, 'rb').read(), YourFileCode, 'exec'))
Concatenate fields without empty value
df['adresse'] = df[['Filed1', 'Filed2', 'Filed3']].apply(lambda x: ' '.join(x.dropna()), axis=1)
Force data type creating the dataframe
df = pd.read_csv(workDirectory + FileName, sep=';', usecols=fieldsExisting, dtype={'firstname': 'str', 'lastname': 'str'} )
- Tips:
- Sometimes when your data comes from an Excel file and contains percentages,
dtype
will be non-efficient because Excel stores percentages as numbers. There is a solution in the chapter Advanced read/write in Excel of this article, just search Fix Excel percentages.
Fix scientific notation for phone numbers in Pandas, Tabulate and Excel export
Example for phone numbers:
df['Mobile Phone'] = df['Mobile Phone'].astype(str) df['Mobile Phone'] = df['Mobile Phone'].fillna('') df['Mobile Phone'] = df['Mobile Phone'].replace(['nan'], '') df['Mobile Phone'] = df['Mobile Phone'].apply(lambda x: x[:-2] if x.endswith('.0') else x)
Get file/directory date creation
import os import time from time import gmtime, strftime myRep = 'C://Path/of/my/directory' myDateCreation = time.ctime(os.path.getctime(myRep)) print('\n' + str(myDateCreation)) myConvertDateCreation = time.strptime(myDateCreation) print("\nFull:", myConvertDateCreation) print("\nDay:", myConvertDateCreation.tm_mday) print("Month:", myConvertDateCreation.tm_mon) print("Year:", myConvertDateCreation.tm_year) print("Hour:", myConvertDateCreation.tm_hour) print("Minutes:", myConvertDateCreation.tm_min) print(strftime("\n%A %d %B %Y %H:%M", myConvertDateCreation ))
Use a dataframe not yet created in a function
Pass it as global
:
def MyDeletion(): global df ind_drop = df[df['My field'].apply(lambda x: x == ('My value'))].index df = df.drop(ind_drop)
Organize your screens (new, OK for W11 for example)
import subprocess import pygetwindow as gw import time import os import psutil # CLOSE PREVIOUS WINDOWS TARGET = "explorer.exe" [process.kill() for process in psutil.process_iter() if process.name() == TARGET] # OPEN A WINDOW subprocess.Popen('explorer "C:/Users/Downloads"') time.sleep(0.5) # SEE WINDOWS TITLES windowsList = gw.getAllTitles() # DELETE USELESS THINGS IF CONTAINS windowsList = [x for x in windowsList if 'Notepad' not in x] # DELETE USELESS THINGS IF EQUALS windowsList = [element for element in windowsList if element != ''] print('My windows : ' + str(windowsList)) # MOVE A WINDOW for window in windowsList: if window == 'Downloads: File explorer': explorateur = gw.getWindowsWithTitle(window)[0] explorateur.resizeTo(500, 505) # width, height explorateur.moveTo(11, 15) # from left, from right
Organize your screens (old, see below for a better way)
Attention: wait the end of the process for proper screens!
import subprocess import clipboard import pyautogui import pygetwindow as gw from screeninfo import get_monitors # Callback function to find file explorer windows def enum_windows_callback(hwnd, results): if win32gui.GetClassName(hwnd) == 'CabinetWClass': results.append(hwnd) return True # Get all File Explorer windows explorer_windows = [] win32gui.EnumWindows(enum_windows_callback, explorer_windows) # Close every File Explorer window for hwnd in explorer_windows: # Activate window win32gui.ShowWindow(hwnd, win32con.SW_SHOW) win32gui.SetForegroundWindow(hwnd) # Use pyautogui to send Alt+F4 key combination to close window pyautogui.hotkey('alt', 'f4') # Open File Explorer subprocess.call("start explorer", shell=True) # Wait a short while for File Explorer to fully open pyautogui.sleep(1) # Select the desired screen (for example, the first screen) monitors = get_monitors() screen = monitors[0]# Récupérer les coordonnées du coin supérieur gauche de l'écran left = screen.x top = screen.y # Find the File Explorer window explorer_window = gw.getWindowsWithTitle('Explorateur de fichiers')[0] # Desired position on the screen pyautogui.sleep(0.5) x_position = 10 # To the right y_position = 15 # To the down explorer_window.moveTo(x_position, y_position) # Size pyautogui.sleep(0.5) width = 600 height = 600 explorer_window.resizeTo(width, height) # Directory pyautogui.sleep(0.5) directory_path = 'C:\\Users\\Georges\\Downloads' clipboard.copy(directory_path) pyautogui.hotkey('alt', 'a') # pyautogui.typewrite(directory_path) pyautogui.hotkey('ctrl', 'v') pyautogui.press('enter') # pyautogui.click(button='right') # subprocess.call("start notepad", shell=True)C:\Users\Georges\Downloads
Search a string in database
from tabulate import tabulate as tab import pandas as pd from sqlalchemy import create_engine SearchWord = 'anyxxxtube' username = 'XXXXX' password = 'XXXXX' port = 3306 database = 'XXXXX' host = 'XXXXX' engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (username, password, host, port, database)) # SEARCH TABLES AND FIELDS sql = 'SELECT table_name, column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name LIKE "prime_%%" ;' df = pd.read_sql_query(sql, engine) print(tab(df.head(5), headers='keys', tablefmt='psql', showindex=False)) print(df.shape[0]) # BUILD QUERIES Assemblage = list(zip(df['table_name'].tolist(), df['column_name'].tolist())) # for a, b in Assemblage: # print(a, ' - ', b) for a, b in Assemblage: sql = 'SELECT '' + b + '' FROM '' + a + '' WHERE '' + b + '' LIKE "%%' + SearchWord + '%%" ;' df = pd.read_sql_query(sql, engine) if df.shape[0] > 0: print('In table "' + a + '" :') print(tab(df.head(5), headers='keys', tablefmt='psql', showindex=False)) print(df.shape[0])
Name dataframes dynamically
In a loop for exampe, with globals()
:
ListFiles = ['Stats Name1.xlsx', 'Stats Name2.xlsx'] for f in ListFiles: dfTemp = pd.read_excel(outputDir + f, sheet_name='Sexe', engine='openpyxl') name = f.replace('.xlsx', '').replace('Stats ', '') globals()[name] = dfTemp print('\n' + name + ' :') print(tabulate(globals()[name].head(35), headers='keys', tablefmt='psql', showindex=False))
Generate a unique random ID
import uuid unique_id = str(uuid.uuid4())