I have multiple text files but I take 1 as example (name as test.txt):
test.txt
##TSTA##
Record_01_TEST_0001ACA0018
Record_01_TEST_0002BRF: S01TX2OL7L
Record_01_TEST_0003CLASTNAME1, FIRSTNAME1 MIDDLENAME1
Record_01_TEST_0004CADDRESS1
Record_01_TEST_0005CADDRESS2
Record_01_TEST_0006D03141979*******M
Record_01_TEST_0007EINFOE
Record_01_TEST_0008FINFOF_1
Record_01_TEST_0009FINFOF_2
Record_01_TEST_0010FINFOF_3
Record_01_TEST_0011FINFOF_4
Record_01_TEST_0012FINFOF_5
Record_01_TEST_0013FINFOF_6
Record_01_TEST_0014FINFOF_7
Record_01_TEST_0015FINFOF_8
Record_01_TEST_0016J
Record_01_TEST_0017K
Record_01_TEST_0018L
Record_02_TEST_0001ANC0013
Record_02_TEST_0002BRF: S01TX2WWS1
Record_02_TEST_0003CLASTNAME2, FIRSTNAME2 MIDDLENAME2
Record_02_TEST_0004CADDRESS1
Record_02_TEST_0005CADDRESS2
Record_02_TEST_0006D12211961 F
Record_02_TEST_0007EINFOE
Record_02_TEST_0008FINFOF_1
Record_02_TEST_0009GINFOG_1
Record_02_TEST_0010GINFOG_2
Record_02_TEST_0011J
Record_02_TEST_0012K
Record_02_TEST_0013L
Record_03_TEST_0001ANC0021
Record_03_TEST_0002BRF: S01TX2WWZJ
Record_03_TEST_0003CLASTNAME3, FIRSTNAME3 MIDDLENAME3
Record_03_TEST_0004CADDRESS1
Record_03_TEST_0005CADDRESS2
Record_03_TEST_0006CADDRESS3
Record_03_TEST_0007D092019955 ** M*****
Record_03_TEST_0008EINFOE
Record_03_TEST_0009FINFOF_1
Record_03_TEST_0010FINFOF_2
Record_03_TEST_0011FINFOF_3
Record_03_TEST_0012FINFOF_4
Record_03_TEST_0013FINFOF_5
Record_03_TEST_0014FINFOF_6
Record_03_TEST_0015HINFOH_1
Record_03_TEST_0016HINFOH_2
Record_03_TEST_0017HINFOH_3
Record_03_TEST_0018HINFOH_4
Record_03_TEST_0019J
Record_03_TEST_0020K
Record_03_TEST_0021L
##TEND##
I want to read this fixed width text file and use 20th character to conditionally create DataFrame.
CODE
import pandas as pd
import numpy as np
TYPE___A = {'ST': None, 'COUNT': None}
TYPE___B = {'KEY': None}
TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}
TYPE___D = {'DOB': None, 'GENDER': None}
TYPE___E = {'E_INFO': None}
TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}
TYPE___G = {'G_INFO': None}
TYPE___J = {'J_INFO': None}
TYPE___K = {'K_INFO': None}
TYPE___L = {'L_INFO': None}
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())
with open("test.txt", 'r') as file:
for line in file:
if line[:8] == "##TSTA##":
continue
elif line[19:20] == "A":
rowDict['ST'] = line[20:22]
rowDict['COUNT'] = line[22:26]
elif line[19:20] == "B":
rowDict['KEY'] = line[20:80]
elif line[19:20] == "C":
for i in range(0,4):
rowDict[f'C_INFO{i+1}'] = line[20:60]
elif line[19:20] == "D":
rowDict['DOB'] = line[20:28]
rowDict['GENDER'] = line[35:36]
elif line[19:20] == "E":
rowDict['E_INFO'] = line[20:39]
elif line[19:20] == "F":
for i in range(0,9):
rowDict[f'F_INFO{i+1}'] = line[20:60]
elif line[19:20] == "G":
rowDict['G_INFO'] = line[20:39]
elif line[19:20] == "J":
rowDict['J_INFO'] = line[20:39]
elif line[19:20] == "K":
rowDict['K_INFO'] = line[20:39]
elif line[19:20] == "L":
rowDict['L_INFO'] = line[20:39]
elif line[:8] == "##TEND##":
break
tmp = pd.DataFrame([rowDict])
output = pd.concat([output, tmp])
However, I should get 3 rows and 23 columns instead of 52 rows. I think my program read each line and append duplicated.
I changed my program to:
import pandas as pd
import numpy as np
TYPE___A = {'ST': None, 'COUNT': None}
TYPE___B = {'KEY': None}
TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}
TYPE___D = {'DOB': None, 'GENDER': None}
TYPE___E = {'E_INFO': None}
TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}
TYPE___G = {'G_INFO': None}
TYPE___J = {'J_INFO': None}
TYPE___K = {'K_INFO': None}
TYPE___L = {'L_INFO': None}
output = pd.DataFrame()
with open(path + "test.txt", 'r') as file:
for line in file:
if line[:8] == "##TSTA##":
continue
elif line[19:20] == "A":
TYPE___A['ST'] = line[20:22]
TYPE___A['COUNT'] = line[22:26]
elif line[19:20] == "B":
TYPE___B['KEY'] = line[20:80]
elif line[19:20] == "C":
for i in range(0,4):
TYPE___C[f'C_INFO{i+1}'] = line[20:60]
elif line[19:20] == "D":
TYPE___D['DOB'] = line[20:28]
TYPE___D['GENDER'] = line[35:36]
elif line[19:20] == "E":
TYPE___E['E_INFO'] = line[20:39]
elif line[19:20] == "F":
for i in range(0,9):
TYPE___F[f'F_INFO{i+1}'] = line[20:60]
elif line[19:20] == "G":
TYPE___G['G_INFO'] = line[20:39]
elif line[19:20] == "J":
TYPE___J['J_INFO'] = line[20:39]
elif line[19:20] == "K":
TYPE___K['K_INFO'] = line[20:39]
elif line[19:20] == "L":
TYPE___L['L_INFO'] = line[20:39]
elif line[:8] == "##TEND##":
break
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
tmp = pd.DataFrame([rowDict])
output = pd.concat([output, tmp])
Still wrong!
The logic for program is:
- If 20th character is "A", then line[20:22] is column
'ST'and line[22:26] is column'COUNT'. - If 20th character is "B", then line[20:80] is column
'KEY'. - If 20th character is "C", then line[20:60] is column
'C_INFO{i+1}'. This part should be different with others. The final dataframe hasC_INFO1toC_INFO4, and it meets first "C" then put inC_INFO1, elif meets second "C" then put inC_INFO2, elif meets third "C" then put inC_INFO3, elif meets fourth "C" then put inC_INFO4. - If 20th character is "D", then line[20:28] is column
'DOB'and line[35:36] is column'GENDER'. - If 20th character is "E", then line[20:39] is column
'E_INFO'. - If 20th character is "F", the logic is similar to "C".
- If 20th character is "G", only take last to fill, so in this example, it should only use
INFOG_2. - If 20th character is "H", ignore.
- "J", "K", "L" are the same as normal logic.
I updated the code and it output 3 rows. However, the C_INFO1 to C_INFO4 and F_INFO1 to F_INFO9 are not parsed correctly. It retrieved the last one.
import pandas as pd
import numpy as np
TYPE___A = {'ST': None, 'COUNT': None}
TYPE___B = {'KEY': None}
TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}
TYPE___D = {'DOB': None, 'GENDER': None}
TYPE___E = {'E_INFO': None}
TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}
TYPE___G = {'G_INFO': None}
TYPE___J = {'J_INFO': None}
TYPE___K = {'K_INFO': None}
TYPE___L = {'L_INFO': None}
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())
with open(path + "test.txt", 'r') as file:
for line in file:
if line[:8] == "##TSTA##":
continue
elif line[19:20] == "A":
ID_A = line[0:15]
TYPE___A['ST'] = line[20:22]
TYPE___A['COUNT'] = line[22:26]
elif (line[19:20] == "B") & (line[0:15] == ID_A):
TYPE___B['KEY'] = line[20:80]
elif (line[19:20] == "C") & (line[0:15] == ID_A):
for i in range(0,4):
TYPE___C[f'C_INFO{i+1}'] = line[20:60]
elif (line[19:20] == "D") & (line[0:15] == ID_A):
TYPE___D['DOB'] = line[20:28]
TYPE___D['GENDER'] = line[35:36]
elif (line[19:20] == "E") & (line[0:15] == ID_A):
TYPE___E['E_INFO'] = line[20:39]
elif (line[19:20] == "F") & (line[0:15] == ID_A):
for i in range(0,9):
TYPE___F[f'F_INFO{i+1}'] = line[20:60]
elif (line[19:20] == "G") & (line[0:15] == ID_A):
TYPE___G['G_INFO'] = line[20:39]
elif (line[19:20] == "J") & (line[0:15] == ID_A):
TYPE___J['J_INFO'] = line[20:39]
elif (line[19:20] == "K") & (line[0:15] == ID_A):
TYPE___K['K_INFO'] = line[20:39]
elif (line[19:20] == "L") & (line[0:15] == ID_A):
TYPE___L['L_INFO'] = line[20:39]
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
tmp = pd.DataFrame([rowDict])
output = pd.concat([output, tmp])
elif line[:8] == "##TEND##":
break
Any Idea? Thanks!

