1

I have multiple text files but I take 1 as example (name as test.txt):

test.txt

##TSTA##
Record_01_TEST_0001ACA0018
Record_01_TEST_0002BRF:                                       S01TX2OL7L                            
Record_01_TEST_0003CLASTNAME1, FIRSTNAME1 MIDDLENAME1                                                               
Record_01_TEST_0004CADDRESS1                                                               
Record_01_TEST_0005CADDRESS2                                                               
Record_01_TEST_0006D03141979*******M                                                          
Record_01_TEST_0007EINFOE
Record_01_TEST_0008FINFOF_1
Record_01_TEST_0009FINFOF_2
Record_01_TEST_0010FINFOF_3
Record_01_TEST_0011FINFOF_4
Record_01_TEST_0012FINFOF_5
Record_01_TEST_0013FINFOF_6
Record_01_TEST_0014FINFOF_7
Record_01_TEST_0015FINFOF_8
Record_01_TEST_0016J
Record_01_TEST_0017K
Record_01_TEST_0018L
Record_02_TEST_0001ANC0013
Record_02_TEST_0002BRF:                                       S01TX2WWS1                            
Record_02_TEST_0003CLASTNAME2, FIRSTNAME2 MIDDLENAME2                                                               
Record_02_TEST_0004CADDRESS1                                                               
Record_02_TEST_0005CADDRESS2                                                               
Record_02_TEST_0006D12211961       F                                                                
Record_02_TEST_0007EINFOE
Record_02_TEST_0008FINFOF_1
Record_02_TEST_0009GINFOG_1
Record_02_TEST_0010GINFOG_2
Record_02_TEST_0011J
Record_02_TEST_0012K
Record_02_TEST_0013L
Record_03_TEST_0001ANC0021
Record_03_TEST_0002BRF:                                       S01TX2WWZJ                            
Record_03_TEST_0003CLASTNAME3, FIRSTNAME3 MIDDLENAME3                                                               
Record_03_TEST_0004CADDRESS1
Record_03_TEST_0005CADDRESS2
Record_03_TEST_0006CADDRESS3
Record_03_TEST_0007D092019955 **   M*****
Record_03_TEST_0008EINFOE
Record_03_TEST_0009FINFOF_1 
Record_03_TEST_0010FINFOF_2
Record_03_TEST_0011FINFOF_3
Record_03_TEST_0012FINFOF_4
Record_03_TEST_0013FINFOF_5
Record_03_TEST_0014FINFOF_6
Record_03_TEST_0015HINFOH_1
Record_03_TEST_0016HINFOH_2
Record_03_TEST_0017HINFOH_3
Record_03_TEST_0018HINFOH_4
Record_03_TEST_0019J
Record_03_TEST_0020K
Record_03_TEST_0021L
##TEND##

I want to read this fixed width text file and use 20th character to conditionally create DataFrame.

CODE

import pandas as pd
import numpy as np

TYPE___A = {'ST': None, 'COUNT': None}

TYPE___B = {'KEY': None}

TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

TYPE___D = {'DOB': None, 'GENDER': None}

TYPE___E = {'E_INFO': None}

TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

TYPE___G = {'G_INFO': None}

TYPE___J = {'J_INFO': None}

TYPE___K = {'K_INFO': None}

TYPE___L = {'L_INFO': None}

rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}

output = pd.DataFrame(columns = rowDict.keys())

with open("test.txt", 'r') as file:
    for line in file:
        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            rowDict['ST']    = line[20:22]
            rowDict['COUNT'] = line[22:26]
        elif line[19:20] == "B":
            rowDict['KEY']  = line[20:80]
        elif line[19:20] == "C":
            for i in range(0,4):
                rowDict[f'C_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "D":
            rowDict['DOB']  = line[20:28]
            rowDict['GENDER']  = line[35:36]
        elif line[19:20] == "E":
            rowDict['E_INFO']  = line[20:39]
        elif line[19:20] == "F":
            for i in range(0,9):
                rowDict[f'F_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "G":
            rowDict['G_INFO']  = line[20:39]
        elif line[19:20] == "J":
            rowDict['J_INFO']  = line[20:39]
        elif line[19:20] == "K":
            rowDict['K_INFO']  = line[20:39]
        elif line[19:20] == "L":
            rowDict['L_INFO']  = line[20:39]        
        elif line[:8] == "##TEND##":
            break
        tmp = pd.DataFrame([rowDict])
        output = pd.concat([output, tmp])

However, I should get 3 rows and 23 columns instead of 52 rows. I think my program read each line and append duplicated.

I changed my program to:

import pandas as pd
import numpy as np

TYPE___A = {'ST': None, 'COUNT': None}

TYPE___B = {'KEY': None}

TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

TYPE___D = {'DOB': None, 'GENDER': None}

TYPE___E = {'E_INFO': None}

TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

TYPE___G = {'G_INFO': None}

TYPE___J = {'J_INFO': None}

TYPE___K = {'K_INFO': None}

TYPE___L = {'L_INFO': None}

output = pd.DataFrame()

with open(path + "test.txt", 'r') as file:
    for line in file:
        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            TYPE___A['ST']    = line[20:22]
            TYPE___A['COUNT'] = line[22:26]
        elif line[19:20] == "B":
            TYPE___B['KEY']  = line[20:80]
        elif line[19:20] == "C":
            for i in range(0,4):
                TYPE___C[f'C_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "D":
            TYPE___D['DOB']  = line[20:28]
            TYPE___D['GENDER']  = line[35:36]
        elif line[19:20] == "E":
            TYPE___E['E_INFO']  = line[20:39]
        elif line[19:20] == "F":
            for i in range(0,9):
                TYPE___F[f'F_INFO{i+1}'] = line[20:60]
        elif line[19:20] == "G":
            TYPE___G['G_INFO']  = line[20:39]
        elif line[19:20] == "J":
            TYPE___J['J_INFO']  = line[20:39]
        elif line[19:20] == "K":
            TYPE___K['K_INFO']  = line[20:39]
        elif line[19:20] == "L":
            TYPE___L['L_INFO']  = line[20:39]        
        elif line[:8] == "##TEND##":
            break
        rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
        tmp = pd.DataFrame([rowDict])
        output = pd.concat([output, tmp])

Still wrong!

The logic for program is:

  1. If 20th character is "A", then line[20:22] is column 'ST' and line[22:26] is column 'COUNT'.
  2. If 20th character is "B", then line[20:80] is column 'KEY'.
  3. If 20th character is "C", then line[20:60] is column 'C_INFO{i+1}'. This part should be different with others. The final dataframe has C_INFO1 to C_INFO4, and it meets first "C" then put in C_INFO1, elif meets second "C" then put in C_INFO2, elif meets third "C" then put in C_INFO3, elif meets fourth "C" then put in C_INFO4.
  4. If 20th character is "D", then line[20:28] is column 'DOB' and line[35:36] is column 'GENDER'.
  5. If 20th character is "E", then line[20:39] is column 'E_INFO'.
  6. If 20th character is "F", the logic is similar to "C".
  7. If 20th character is "G", only take last to fill, so in this example, it should only use INFOG_2.
  8. If 20th character is "H", ignore.
  9. "J", "K", "L" are the same as normal logic.

I updated the code and it output 3 rows. However, the C_INFO1 to C_INFO4 and F_INFO1 to F_INFO9 are not parsed correctly. It retrieved the last one.

import pandas as pd
import numpy as np

TYPE___A = {'ST': None, 'COUNT': None}

TYPE___B = {'KEY': None}

TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

TYPE___D = {'DOB': None, 'GENDER': None}

TYPE___E = {'E_INFO': None}

TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

TYPE___G = {'G_INFO': None}

TYPE___J = {'J_INFO': None}

TYPE___K = {'K_INFO': None}

TYPE___L = {'L_INFO': None}

rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())

with open(path + "test.txt", 'r') as file:
    for line in file:
        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            ID_A = line[0:15]
            TYPE___A['ST']    = line[20:22]
            TYPE___A['COUNT'] = line[22:26]
        elif (line[19:20] == "B") & (line[0:15] == ID_A):
            TYPE___B['KEY']  = line[20:80]
        elif (line[19:20] == "C") & (line[0:15] == ID_A):
            for i in range(0,4):
                TYPE___C[f'C_INFO{i+1}'] = line[20:60]
        elif (line[19:20] == "D") & (line[0:15] == ID_A):
            TYPE___D['DOB']  = line[20:28]
            TYPE___D['GENDER']  = line[35:36]
        elif (line[19:20] == "E") & (line[0:15] == ID_A):
            TYPE___E['E_INFO']  = line[20:39]
        elif (line[19:20] == "F") & (line[0:15] == ID_A):
            for i in range(0,9):
                TYPE___F[f'F_INFO{i+1}'] = line[20:60]
        elif (line[19:20] == "G") & (line[0:15] == ID_A):
            TYPE___G['G_INFO']  = line[20:39]
        elif (line[19:20] == "J") & (line[0:15] == ID_A):
            TYPE___J['J_INFO']  = line[20:39]
        elif (line[19:20] == "K") & (line[0:15] == ID_A):
            TYPE___K['K_INFO']  = line[20:39]
        elif (line[19:20] == "L") & (line[0:15] == ID_A):
            TYPE___L['L_INFO']  = line[20:39]
            rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
            tmp = pd.DataFrame([rowDict])
            output = pd.concat([output, tmp])
        elif line[:8] == "##TEND##":
            break

The desired layout should be:enter image description here

Any Idea? Thanks!

2
  • Is it grouped in samples? Ie. sample1 A-L, followed by sample2 A-L etc Commented Aug 24, 2021 at 4:54
  • Yes. But I'm not 100% sure that each record has "L", but I can say that each record has "A" Commented Aug 24, 2021 at 13:25

1 Answer 1

1

Here you go.enter image description here

import pandas as pd
import numpy as np

def define_empty_dict():
    TYPE___A = {'ST': None, 'COUNT': None}

    TYPE___B = {'KEY': None}

    TYPE___C = {'C_INFO1': None, 'C_INFO2': None, 'C_INFO3': None, 'C_INFO4': None}

    TYPE___D = {'DOB': None, 'GENDER': None}

    TYPE___E = {'E_INFO': None}

    TYPE___F = {'F_INFO1': None, 'F_INFO2': None, 'F_INFO3': None, 'F_INFO4': None, 'F_INFO5': None, 'F_INFO6': None, 'F_INFO7': None, 'F_INFO8': None, 'F_INFO9': None}

    TYPE___G = {'G_INFO': None}

    TYPE___J = {'J_INFO': None}

    TYPE___K = {'K_INFO': None}

    TYPE___L = {'L_INFO': None}
    return TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L

TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L = define_empty_dict()
rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
output = pd.DataFrame(columns = rowDict.keys())

##


with open("test.txt", 'r') as file:

    for i, line in enumerate(file):

        if line[:8] == "##TSTA##":
            continue
        elif line[19:20] == "A":
            ID_A = line[0:15]
            TYPE___A['ST']    = line[20:22]
            TYPE___A['COUNT'] = line[22:26]
        elif (line[19:20] == "B") & (line[0:15] == ID_A):
            TYPE___B['KEY']  = line[20:80]
        elif (line[19:20] == "C") & (line[0:15] == ID_A):
            if "number_ref_C" not in globals():
                number_ref_C = int(line[15:19])
                c = 1
                TYPE___C[f'C_INFO{c}'] = line[20:60]
            else :
                c += 1
                TYPE___C[f'C_INFO{c}'] = line[20:60]
        elif (line[19:20] == "D") & (line[0:15] == ID_A):
            TYPE___D['DOB']  = line[20:28]
            TYPE___D['GENDER']  = line[35:36]
        elif (line[19:20] == "E") & (line[0:15] == ID_A):
            TYPE___E['E_INFO']  = line[20:39]
        elif (line[19:20] == "F") & (line[0:15] == ID_A):
            if "number_ref_F" not in globals():
                number_ref_F = int(line[15:19])
                f = 1
                TYPE___F[f'F_INFO{f}'] = line[20:60]
            else :
                f += 1
                TYPE___F[f'F_INFO{f}'] = line[20:60]
        elif (line[19:20] == "G") & (line[0:15] == ID_A):
            TYPE___G['G_INFO']  = line[20:39]
        elif (line[19:20] == "J") & (line[0:15] == ID_A):
            TYPE___J['J_INFO']  = line[20:39]
        elif (line[19:20] == "K") & (line[0:15] == ID_A):
            TYPE___K['K_INFO']  = line[20:39]
        elif (line[19:20] == "L") & (line[0:15] == ID_A):
            TYPE___L['L_INFO']  = line[20:39]
            rowDict = {**TYPE___A, **TYPE___B, **TYPE___C, **TYPE___D, **TYPE___E, **TYPE___F, **TYPE___G, **TYPE___J, **TYPE___K, **TYPE___L}
            tmp = pd.DataFrame([rowDict])
            output = pd.concat([output, tmp])
            TYPE___A, TYPE___B, TYPE___C, TYPE___D, TYPE___E,TYPE___F, TYPE___G, TYPE___J, TYPE___K, TYPE___L = define_empty_dict()
            del number_ref_C, number_ref_F
        elif line[:8] == "##TEND##":
            break

pd.set_option('display.max_columns', None)
output
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.