Index de l'article

Advanced write in a TXT file

Split file according to lines numbers

What follows is not strictly about Pandas, but it is useful in some Pandas contexts.

First we count the number of lines in the file (from the number of occurrences of \n), then, if it exceeds 50,000 lines, we split it into files of 50,000 lines each.

import os

# Path of the SQL file that may have to be split.
big_path = workDirectory + 'FixQueries.sql'

# Count the lines from the number of "\n" characters in the file.
with open(big_path, 'r') as myfile:
    data = myfile.read()
    taille_fichier = data.count("\n")

lines_max = 50000   # maximum number of lines per output file
numero_fichier = 0  # number of the output file currently being written
if taille_fichier > lines_max:
    print('Attention : le fichier de sortie fait '+str(taille_fichier)+' lignes ! Veuillez patienter...')
    smallfile = None
    try:
        with open(big_path) as bigfile:
            for lineno, line in enumerate(bigfile):
                # Every lines_max lines, close the current output file
                # and open the next numbered one.
                if lineno % lines_max == 0:
                    if smallfile:
                        smallfile.close()
                    numero_fichier += 1
                    small_filename = workDirectory + 'FixQueries {}.sql'.format(numero_fichier)
                    smallfile = open(small_filename, "w")
                smallfile.write(line)
    finally:
        # Close the last output file, even if reading or writing failed.
        if smallfile:
            smallfile.close()
    # Report the number of files actually written.
    print('Nous l\'avons découpé en ', numero_fichier, 'fichiers !\n')
    # The big file is closed at this point (end of the `with` block),
    # so it can be deleted now that its content lives in the small files.
    os.remove(big_path)

 And then merge the files:

# Files to concatenate, in the order they must appear in the result.
filenames = [
    'C:/_gh/0/file_25000.txt',
    'C:/_gh/0/file_50000.txt',
    'C:/_gh/0/file_75000.txt',
    'C:/_gh/0/file_100000.txt',
    'C:/_gh/0/file_125000.txt',
]

# Write the full content of each source file, one after the other,
# into the cumulative file.
with open('C:/_gh/0/CUMUL1.txt', 'w') as outfile:
    for source_path in filenames:
        with open(source_path) as infile:
            content = infile.read()
        outfile.write(content)

You may need to list the names of the files before merging them, for example with PowerShell (Alt+F+R):

# Write the name of every item of the current folder into _files_list.txt.
Get-ChildItem | Select-Object -ExpandProperty Name > _files_list.txt
Attention 👈
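Be careful to keep the files in the right order, and to keep the last line of each file empty (otherwise the last line of one file is glued to the first line of the next one).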

Replace text in a file

my_file = r'C:/_gh/0/_Urbanisme/test.txt'

# (text to find, replacement) pairs, applied in this order.
# They are plain-text substitutions (str.replace), not regular expressions.
replacements = [
    ('heros', 'héros'),
    ('Batm?n', 'Batman'),
    ('télévisee', 'télévisée'),
    (' s attaquent', ' s\'attaquent'),
]

# Load the whole file in memory.
with open(my_file, 'r') as source:
    data = source.read()

# Apply every substitution in turn on the loaded text.
for old_text, new_text in replacements:
    data = data.replace(old_text, new_text)

# Overwrite the file with the corrected text.
with open(my_file, 'w') as target:
    target.write(data)

# Confirm
print("OK !")

This is not directly related to Pandas, but it is very useful for storing notes to remember during data processing, and for automating that step.

Create/replace a TXT file and write something into

Use the Write option (w):

MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# 'w' creates the file, or empties it first if it already exists.
with open(MyTxtFile, 'w') as output:
    # Write the fixed text, then the content of MyVariable.
    output.writelines(['Blablabla...\n', MyVariable])

Add some lines at the start of an existing file

MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# Keep the current content in memory ...
with open(MyTxtFile, 'r') as original:
    data = original.read()

# ... then rewrite the file with the new line placed before it.
with open(MyTxtFile, 'w') as modified:
    modified.write('XPTDR\n' + data)

Add some lines at the end of an existing file

Use the Append option (a):

MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# 'a' keeps the existing content and writes after it.
with open(MyTxtFile, 'a') as output:
    # Write the fixed text, then the content of MyVariable.
    output.writelines(['Blablabla...\n', MyVariable])

Replace some lines in an existing file

Here we first read (r option) and store the content of the file, then re-create it without the lines we want to delete, and finally add the new lines:

MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# Beginnings of the lines that must be removed from the file.
prefixes_to_delete = (
    'First line to delete...',
    'Second line to delete...',
    'Third line to delete...',
)

with open(MyTxtFile, 'r') as f:
    lines = f.readlines()

# Keep only the lines that start with none of the prefixes
# (str.startswith accepts a tuple of candidates).
kept_lines = [line for line in lines if not line.startswith(prefixes_to_delete)]

with open(MyTxtFile, 'w') as f:
    f.writelines(kept_lines)
    # Make sure the new lines start on their own line, even if the last
    # kept line has no final line break.
    if kept_lines and not kept_lines[-1].endswith('\n'):
        f.write('\n')
    # Each new line ends with a line break, otherwise the three texts
    # would be glued together on a single line.
    f.write('New line 1\n')
    f.write('New line 2\n')
    f.write('New line 3\n')

Replace a simple string in a TXT file

MyTxtFile = 'C:/_gh/0/My beautiful file.txt'

# Load the current content of the file.
with open(MyTxtFile, 'r') as source:
    original_text = source.read()

# Replace every occurrence of the first string with the second one.
filedata = original_text.replace('String 1', 'String 2')

# Overwrite the file with the modified content.
with open(MyTxtFile, 'w') as target:
    target.write(filedata)

Replace a string with a regular expression (regex) in a TXT file

FichierTest = 'C:/_gh/0/FichierTest.txt'

import re

# Some useful patterns:
#   a digit                            : r'[0-9]'
#   a line break                       : r'[\n]'
#   two or more line-break characters  : r'[\r\n]{2,}'

# 'r+' opens the file for reading and writing without emptying it.
with open(FichierTest, 'r+') as handle:
    # Collapse every run of two or more line-break characters into exactly two.
    text = re.sub(r'[\r\n]{2,}', '\n\n', handle.read())
    handle.seek(0)      # go back to the beginning before overwriting
    handle.write(text)
    handle.truncate()   # drop what remains of the previous content after the new text

Merge 2 txt files

import os

MyTxtFile1 = 'C:/_gh/0/My beautiful file 1.txt'
MyTxtFile2 = 'C:/_gh/0/My beautiful file 2.txt'
MyTxtFileMerged = 'C:/_gh/0/My beautiful file merged.txt'

# Load both source files (opened without an explicit encoding).
with open(MyTxtFile1, 'r') as _MyTxtFile1:
    dataMyTxtFile1 = _MyTxtFile1.read()
with open(MyTxtFile2, 'r') as _MyTxtFile2:
    dataMyTxtFile2 = _MyTxtFile2.read()

# Write the first content followed by the second one, encoded as UTF-8.
with open(MyTxtFileMerged, 'w', encoding='utf-8') as _MyTxtFileMerged:
    _MyTxtFileMerged.write(dataMyTxtFile1)
    _MyTxtFileMerged.write(dataMyTxtFile2)

# The sources are deleted only once the merged file is written and closed.
os.remove(MyTxtFile1)
os.remove(MyTxtFile2)

Test if a comma is used as decimal separator in a TXT file

import re

with open(MyTxtFile, 'r') as myfile:
    data = myfile.read()

# Look for a comma between two groups of digits that fill a whole
# tab-separated field. Python's `re` module does not support \K, so the
# digits before the comma are simply part of the match; only whether
# something was found matters here. re.MULTILINE makes ^ and $ match at the
# start and end of every line, not only of the whole text.
pb_regex = re.findall(r'(?:\t|^)\d+,(?=\d+(?:\t|$))', data, flags=re.MULTILINE)
if pb_regex:
    # The printed pattern keeps \K: it is meant for Notepad's search box,
    # not for Python. Backslashes are doubled so that the text is displayed
    # as-is without relying on invalid escape sequences.
    print(colored(
        '\nAttention, comma is used as a decimal separator in numeric value!\n' +
        'Find them in Notepad with:\n(?:\\t|^)\\d+\\K,(?=\\d+(?:\\t|$))',
        'yellow'))

Write sentences from dataframe

MyTxtFile = WorkDirectory + ' - My file.txt'

with open(MyTxtFile, 'w') as file:
    file.write('\nSpecial cases:\n\n')

    # Values of the four fields, as plain Python lists.
    columns = [df[name].tolist() for name in ('Field1', 'Field2', 'Field3', 'Field4')]

    # Build one sentence per row: "- <Field1> <Field2> (<Field3>) in <Field4>".
    sentences = [
        '- {!s} {!s} ({!s}) in {!s}'.format(*row)
        for row in zip(*columns)
    ]

    # Write the sentences, one per line.
    file.writelines(sentence + '\n' for sentence in sentences)

Convert encoding

With chardet:

from chardet import detect
 
def get_encoding_type(file):
    """Return the 'encoding' entry of chardet's detection result for *file*.

    The file is read entirely, in binary mode, and its raw bytes are passed
    to ``detect``.
    """
    with open(file, 'rb') as handle:
        detection = detect(handle.read())
    return detection['encoding']
 
import os

from_codec = get_encoding_type(MyOriginalFile)

if from_codec is None:
    # No encoding was reported for this file: there is nothing reliable to
    # convert from, so the file is left untouched.
    print(colored('Attention, the encoding could not be detected!', 'yellow'))
elif from_codec.lower() != 'utf-8':
    # The comparison ignores case so that 'UTF-8' is not converted again.
    print(colored('Attention, script is ' + from_codec + '!', 'yellow'))

    try:
        # Decode the original with the detected codec and write it as UTF-8.
        with open(MyOriginalFile, 'r', encoding=from_codec) as f, open(MyNewFile, 'w', encoding='utf-8') as e:
            text = f.read()
            e.write(text)

        # Put the converted file in place of the original in a single step.
        os.replace(MyNewFile, MyOriginalFile)

        print(colored('We converted the file as UTF-8.', 'yellow'))

    except UnicodeDecodeError:
        print('Decode Error')
    except UnicodeEncodeError:
        print('Encode Error')

With io:

import io
 
# Decode the source file as UTF-8 ...
with io.open(OriginalTxtFile, mode='r', encoding='utf8') as source:
    content = source.read()

# ... and write the same text encoded as cp1252.
with io.open(FinalTxtFile, mode='w', encoding='cp1252') as target:
    target.write(content)

Replace a string with a dataframe in an existing txt file

Do a replace with a df.to_string(), maybe with a little regex to replace unwanted spaces.

# NOTE(review): the other snippets name this variable MyTxtFile — confirm
# the spelling MtTxtFile against the code that defines it.
# Load the current content of the file before modifying it.
with open(MtTxtFile, 'r') as file:
    MyTxtData = file.read()

# Put the dataframe, rendered as text without header and index, in place of
# the placeholder.
MyTxtData = MyTxtData.replace('STRING TO REPLACE', df.to_string(header=False, index=False, justify='left'))
# Remove the spaces at the beginning of every line.
MyTxtData = re.sub(r'^ *', '', MyTxtData, flags=re.MULTILINE)

# Overwrite the file with the result.
with open(MtTxtFile, 'w') as file:
    file.write(MyTxtData)

 

Liens ou pièces jointes
Télécharger ce fichier (France-Departements-Deformation.zip)France-Departements-Deformation.zip[France-Departements-Deformation]335 Ko
Télécharger ce fichier (simple_countries.zip)simple_countries.zip[simple_countries]1880 Ko