I am trying to read from a table and set the size of each chunk to 256 MB, without specifying divisions or npartitions. But when I check the size of each partition, it is less than 60 MB.
Minimal complete code for reproducing the issue:
Generate Data:
import string
import sqlite3
from itertools import permutations

def create_table(cur):
    query = '''create table sample(id int, name text)'''
    cur.execute(query)

c = 1

def get_chunk(n, chunk_size):
    """Yield lists of (id, name) rows built from n-letter permutations."""
    global c
    chunk = []
    for d in permutations(string.ascii_uppercase, n):
        t = c, ''.join(d)
        chunk.append(t)
        c += 1
        if len(chunk) >= chunk_size:
            yield chunk
            chunk = []
    if chunk:  # also yield the final, partial chunk
        yield chunk

def insert_data(cursor, n, chunk_size):
    for chunk in get_chunk(n, chunk_size):
        cursor.executemany('insert into sample values (?, ?)', chunk)

conn = sqlite3.connect('test.db')
cur = conn.cursor()
create_table(cur)
conn.commit()
insert_data(cur, 6, 1000)
conn.commit()
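
As a sanity check, the row count and on-disk size of the generated database can be confirmed with something like this (assuming the data was written to test.db as above):

import os
import sqlite3

conn = sqlite3.connect('test.db')
cur = conn.cursor()

# Total number of rows inserted into the table
cur.execute('select count(*) from sample')
print('rows:', cur.fetchone()[0])

# Size of the SQLite file on disk, in MB
print('db size (MB):', os.path.getsize('test.db') / 1024 ** 2)

conn.close()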
For processing:
import dask.dataframe as dd

df = dd.read_sql_table('sample', 'sqlite:////path/to/test.db', 'id', bytes_per_chunk=268435456)

def fun(df):
    print(df.name.count())
    print(df.info())

df.map_partitions(fun).compute()
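
To see how far the partitions actually fall short of bytes_per_chunk, the in-memory size of each partition can be measured with a sketch like the one below. Note that memory_usage(deep=True) counts the Python string objects backing the name column, so the numbers may not match whatever estimate dask used when splitting the table:

# One total per partition, printed in MB
sizes = df.map_partitions(lambda pdf: pdf.memory_usage(deep=True).sum()).compute()
print(sizes / 1024 ** 2)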