Index de l'article

Read/export data file (CSV, Excel, MySQL, Postgres, JSON, Shape ...)

Read and display an Excel file

import pandas as pd
from tabulate import tabulate
 
df = pd.read_excel('C:/Users/Your file.xlsx',
                   sheet_name='Export', engine='openpyxl', usecols=['My first column', 'My second column'])
 
print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=True))

I set engine='openpyxl' because it is from an XLSX file. sheet_name specifies the sheet to import, specify your columns with usecols.

Then I use tabulate to display properly my data in a Postgres way (tablefmt='psql').

With head(35) I just display the 35 first lines, but the dataframe here named df contains all the data.

If you wan to re-organize the columns, please see the Miscellaneous chapter.

Read a CSV file

import pandas as pd
from tabulate import tabulate
 
df_subscriber = pd.read_csv(workDirectory+'joo_acymailing_subscriber.csv', sep=';', usecols=['subid', 'source', 'email'])
 
print(tab(df_subscriber.head(10), headers='keys', tablefmt='psql', showindex=False))

Add textual column names from a CSV without column names

Use names=[] :

import pandas as pd
from tabulate import tabulate
 
df = pd.read_csv(inpuMyFilePath, names=['Mon Champ 1', 'Mon Champ 2', 'Mon Champ 3'], sep=',', keep_default_na=False).astype(str)
 
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))

Read a non-standard Excel file

Sometimes a file has wrong top lines, or you can not trust the future column names.

To read a file from a specific row for example, skipping the last rows.

import pandas as pd
from tabulate import tabulate
 
df = pd.read_excel(r'C:/Users/Georges/Downloads/test excel.xlsx',
sheet_name='Sheet1', engine='openpyxl', usecols=[0,1], header=5-1, skipfooter=1, names=['id', 'value'])
 
print(tabulate(df.head(35), headers='keys', tablefmt='psql', showindex=False))

Or skip a specific line (with skiprows=[...]):

...
df = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl', skiprows=[2])
...

Use first row as label column

my_labels = df.iloc[0]
df = df[1:]
df.columns = my_labels

Read from MySQL with sqlalchemy

sqlalchemy is the best way with pandas:

from tabulate import tabulate as tab
import pandas as pd
from sqlalchemy import create_engine
 
username = 'root'
password = ''
port = 3308
database = 'my_beautiful_db'
host ='localhost'
 
engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s' % (username, password, host, port, database))
sql = 'SELECT COUNT(*) AS total FROM communes ;'
df = pd.read_sql_query(sql, engine).set_index('total')
 
print(tab(df.head(5), headers='keys', tablefmt='psql'))
 
engine.dispose()

Specify database encoding

engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s?charset=utf8mb4' % (username, password, host, port, database))

Execute a SQL queries from a file with sqlalchemy

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
 
my_engine = create_engine('mysql+mysqldb://%s:%s@%s:%i/%s?charset=utf8mb4' % (my_username, my_password, my_host, my_port, my_database))
 
my_sql_file = 'C:/MyQueries.sql'
 
# LOAD THE SQL SCRIPT
with open(queriesDirectory + my_sql_file, 'r') as sql_file:
    sql_script = sql_file.read()
 
# CREATE SESSION
my_session = sessionmaker(bind=my_engine)
session = my_session()
 
# START A TRANSACTION
session.begin()
 
try:
    session.execute(text(sql_script))
    print('Execution of "' + my_sql_file + '" finished!')
except:
    print('We can not execute "' + my_sql_file + '"!')
    pass
 
# VALIDATE THE TRANSACTION
session.commit()
 
# CLOSE SESSION AND CONNECTION
session.close()
my_engine.dispose()

Read from MySQL with mysqlclient

import MySQLdb
from tabulate import tabulate
import pandas as pd
 
mydb = MySQLdb.connect(user='root', password='', database='MyDb', host='localhost', port=3306)
query = 'SELECT name FROM users ;'
 
df = pd.read_sql(query, mydb)
mydb.close()
 
print(tabulate(df.head(10), headers='keys', tablefmt='psql'))
number = df.shape[0]
print(number)

Execute a SQL queries from a file with mysqlclient

import MySQLdb
from termcolor import colored
 
conn = MySQLdb.connect(host='localhost',user='root',password='',database='ma_base',port=3308)
 
my_sql_file = 'C:/MyQueries.sql'
 
with open(sqlQueries, 'r', encoding='utf8') as sql_file:
    sql_script = str(my_sql_file.read())
 
try:
    cursor = conn.cursor()
    cursor.execute(sql_script)
    cursor.close()
    print(colored('\nOK!', 'green'))
except:
    print(colored('\nIssue!', 'red'))
    pass

Read from MySQL with mysql.connector

import mysql.connector as connection
import pandas as pd
from tabulate import tabulate as tab
 
mydb = connection.connect(host='XXX.XX.XXX.XX', port=3308, database='MyDb',user='Me', passwd='pwdXXX!456', use_pure=True)
 
try:
    query = "SELECT firstname, lastname FROM customer ;"
    df = pd.read_sql(query, mydb)
except Exception as e:
    mydb.close()
    print(str(e))
 
print(tab(df.head(5), headers='keys', tablefmt='psql'))

Read from Postgres

import pandas as pd
from tabulate import tabulate
import psycopg2
 
con = psycopg2.connect(database='mytransformations', user='postgres', password='', host='localhost', port='5432')
print("OK, we are connect to mytransformations")
 
# DISPLAY WITH PANDAS
df = pd.read_sql("SELECT * FROM myparcels ;", con)
con.close()
 
df.drop('geom', axis=1, inplace=True)
 
print('\nResults:')
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))
print('Number of records:', df.shape[0])

Read from a JSON

import pandas as pd
from tabulate import tabulate
import json
import requests
 
response = requests.get(MON_URL)
my_file = json.loads(response.text)
my_json = my_file['data']
 
mon_df = pd.DataFrame.from_dict(my_json)

Read from a shape

Use geopandas:

import geopandas as gpd
from tabulate import tabulate
 
myShape = 'C:\\Path\\Of\\My\\Shape.shp'
 
print('\n' + myShape)
 
df = gpd.read_file(myShape)
df['type'] = df['geometry'].astype(str).str.replace(r' .*|\(.*', '', regex=True)
df = df[['id', 'type', 'geometry']]
 
print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=True))
print(df.shape[0])
 
MyFieldList = df['id'].drop_duplicates().dropna().sort_values(ascending=False).tolist()
print('\n' + str(MyFieldList))
 
MyGeomTypeList = df['type'].drop_duplicates().dropna().sort_values(ascending=False).tolist()
print('\n' + str(MyGeomTypeList))

Export a data-frame in a new Excel file

df.to_excel("C:/Users/Your new file.xlsx", index=False, sheet_name='Tab 1')

index=False to not get the index column.

Export a data-frame in an existing Excel sheet

Use mode='a' and if_sheet_exists:

from openpyxl import load_workbook
import openpyxl
 
My_File_Path= 'C:/My/File.xlsx'
 
with pd.ExcelWriter(My_File_Path, 'openpyxl', mode='a', if_sheet_exists='replace') as writing:
    df.to_excel(writing, 'My sheet', index=False)

Export several dataframes in Excel sheets

If you have several data-frame to export in one Excel file, each in unique sheets, you should open an ExcelWriter. And close it.

writer = pd.ExcelWriter('C:/Users/Your new file.xlsx', engine='xlsxwriter')
 
df_Country.to_excel(writer, index=False, sheet_name='Country', header=['Country', 'Total'])
df_Emails.to_excel(writer, index=False, sheet_name='Emails', header=['Emails', 'Total'])
 
writer.save()

Export several dataframes in the same sheet

df.to_excel(writer, index=False, sheet_name='Registrations', startcol=3, startrow=34, header=['Filed', 'Total', '%'])

Export a dataframe as a txt file, with separator and ANSI encoding

df.to_csv(my_file_import, header=None, index=None, sep='\t', encoding='mbcs')

Replace a string with a dataframe in an existing txt file

Do a replace with a df.to_string(), maybe with a little regex to replace unwanted spaces.

with open(MtTxtFile, 'r') as file:
    MyTxtData = MyTxtData.replace('STRING TO REPLACE', df.to_string(header=False, index=False, justify='left'))
    MyTxtData = re.sub(r'^ *', '', MyTxtData, flags=re.MULTILINE)
 
with open(MtTxtFile, 'w') as file:
    file.write(MyTxtData)

Add filters in all columns in one sheet in an Excel file

import openpyxl
from openpyxl.utils import get_column_letter
 
outputExcelFile = r'C:/User/Your file.xlsx'
workbook = openpyxl.load_workbook(outputExcelFile)
 
worksheet = workbook['Sheet name']
FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
worksheet.auto_filter.ref = FullRange
 
workbook.save(outputExcelFile)

Add filters in all columns in all sheets in an Excel file

import openpyxl
from openpyxl.utils import get_column_letter
 
outputExcelFile = r'C:/User/Your file.xlsx'
 
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
 
for sheet in sheetsLits:
    worksheet = workbook[sheet]
    FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
    worksheet.auto_filter.ref = FullRange
    workbook.save(outputExcelFile)

Add colors in all column names in a sheet in an Excel file

import openpyxl
from openpyxl.styles import PatternFill
 
workbook = openpyxl.load_workbook(My_File_Path)
worksheet = workbook['My sheet']
 
for cell in worksheet[1]:
    worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid')
    workbook.save(My_File_Path)

Add colors in all column names in all sheets in an Excel file

import openpyxl
from openpyxl.styles import PatternFill
 
outputExcelFile = r'C:/Users/Your File.xlsx'
 
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
 
for sheet in sheetsLits:
    worksheet = workbook[sheet]
    for cell in workbook[sheet][1]:
        worksheet[cell.coordinate].fill = PatternFill(fgColor = 'FFC6C1C1', fill_type = 'solid')
        workbook.save(outputExcelFile)

Modify a column width in one sheet in an Excel file

import openpyxl
 
outputExcelFile = r'C:/Users/Your File.xlsx'
 
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
 
workbook['Your sheet'].column_dimensions['A'].width = 75
workbook.save(outputExcelFile)

Modify all columns width in all sheets in an Excel file

The code below set the A column size to 30, and the others to 10.

import openpyxl
from openpyxl.utils import get_column_letter
 
outputExcelFile = r'C:/Users/Your File.xlsx'
 
workbook = openpyxl.load_workbook(outputExcelFile)
sheetsLits = workbook.sheetnames
 
for sheet in sheetsLits:
    for cell in workbook[sheet][1]:
        if get_column_letter(cell.column) == 'A':
            workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 30
        else:
            workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 10
 
        workbook.save(outputExcelFile)

Add color to some line recovering their indexes from condition

list_duplicate = df.index[df['Ligne en doublon'] == 'DOUBLON !'].tolist()
 
for index in list_duplicate:
    for cell in worksheet[index+2]:
        worksheet[cell.coordinate].fill = PatternFill(fgColor='FF6666', fill_type='solid')
        workbook.save(My_File_Path)

Freeze the top row of an Excel file

worksheet.freeze_panes = 'A2'

Freeze the top row and the first column of an Excel file

worksheet.freeze_panes = 'B2'

Freeze the top row and the 2 first columns of an Excel file

worksheet.freeze_panes = 'C2'

Change font color and style in an Excel file

from openpyxl.styles import Font
 
worksheet['A1'].font = Font(color='FFFFFF', italic=False, bold=True)

Work on several Excel files

Here we freeze the top row, add filters, color and change the width of all columns from 3 Excel files. 

import openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill
 
 
File1 = r'D:/Work/My file 1.xlsx'
File2 = r'D:/Work/My file 2.xlsx'
File3 = r'D:/Work/My file 3.xlsx'
 
ExcelList = [File1, File2, File3]
 
for ExcelFile in ExcelList:
    workbook = openpyxl.load_workbook(ExcelFile)
    sheetsLits = workbook.sheetnames
 
    for sheet in sheetsLits:
        worksheet = workbook[sheet]
        FullRange = 'A1:' + get_column_letter(worksheet.max_column) + str(worksheet.max_row)
        worksheet.auto_filter.ref = FullRange
        worksheet.freeze_panes = 'A2'
        for cell in workbook[sheet][1]:
            worksheet[cell.coordinate].fill = PatternFill(fgColor='FFC6C1C1', fill_type='solid')
            if get_column_letter(cell.column) == 'A':
                workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 12
            else:
                workbook[sheet].column_dimensions[get_column_letter(cell.column)].width = 20
            workbook.save(ExcelFile)

Insert a picture in an Excel file

And delete them after.

img = openpyxl.drawing.image.Image('C:/Users/myplot.png')
img.anchor = 'E6'
 
workbook['Your sheet'].add_image(img)
workbook.save(outputExcelFile)
 
os.remove('C:/Users/myplot.png')

Open a box to search a file and get his path

Use Tkinter.

from tkinter import Tk
from tkinter.filedialog import askopenfilename
 
Tk().withdraw()
filename = askopenfilename()
print(filename)

Open a box to save an Excel file

from tkinter import Tk
from tkinter.filedialog import asksaveasfilename
 
fileName = asksaveasfilename(filetypes=(("Excel files", "*.xlsx"), ("All files", "*.*")), defaultextension='.xslx')
df.to_excel(fileName, index=False, sheet_name='Tab 1', header=None)

Check if a file exists

And use it only if it exists:

import os
 
my_file = 'C:/_gh/0/export.xlsx'
if os.path.isfile(my_file) == True:
    df_ExportFeuil1 = pd.read_excel(my_file, sheet_name='Feuil1', engine='openpyxl')

Rename a file

import os
os.rename('My file.xlsx', 'My file renamed.xlsx')

Search duplicate columns in Excel

from openpyxl import load_workbook
 
wb = load_workbook(filename=my_file)
 
Feuil1 = wb['Feuil1']
 
# Récupérer les noms de colonnes dans la 3ème ligne
noms_colonnes = [colonne.value for colonne in Feuil1[3]]
print(noms_colonnes)
 
# Vérifier les doublons dans les noms de colonnes
doublons = set()
noms_en_double = set()
for nom_colonne in noms_colonnes:
    if nom_colonne in doublons:
        noms_en_double.add(nom_colonne)
    else:
        doublons.add(nom_colonne)
 
# Afficher les noms de colonnes en doublons
if noms_en_double:
    print(colored('Attention colonne en doublon dans le fichier Excel :', 'red'))
    print(colored(str(noms_en_double).replace('{', '').replace('}', ''), 'red'))
    sys.exit()
 
else:
    print(colored('OK, pas de colonne en doublon dans le fichier Excel.', 'green'))

 

Liens ou pièces jointes
Télécharger ce fichier (France-Departements-Deformation.zip)France-Departements-Deformation.zip[France-Departements-Deformation]335 Ko
Télécharger ce fichier (simple_countries.zip)simple_countries.zip[simple_countries]1880 Ko