Since the tradeoff between speed and memmory-ussage I think your method is well situable. But you can still make it faster:
- avoid flattening the arrays inside the loop this will save you some time
- instead of using
.flat[:] or .flatten() use .ravel() (I'm not sure why but it seems to be faster)
- also avoid
for i in range.. just zip in the values of interest (see method3)
Here a good solution that will speed-up things:
r_i_f = row_indices.ravel()
c_i_f = col_indices.ravel()
v_f = values.ravel()
indexed_sum = np.zeros((row_indices.max()+1,col_indices.max()+1))
for i,j,v in zip(r_i_f,c_i_f,v_f):
indexed_sum[i, j] += v
return indexed_sum
To see a comparision here's some toy code (correct any detail it's not proportioned and let me know if it works well for you)
def method1(values,row_indices,col_indices):
"""OP method"""
indexed_sum = np.zeros((row_indices.max()+1,col_indices.max()+1))
for i in range(row_indices.size):
indexed_sum[row_indices.flat[i], col_indices.flat[i]] += values.flat[i]
return indexed_sum
def method2(values,row_indices,col_indices):
"""just raveling before loop. Time saved here is considerable"""
r_i_f = row_indices.ravel()
c_i_f = col_indices.ravel()
v_f = values.ravel()
indexed_sum = np.zeros((row_indices.max()+1,col_indices.max()+1))
for i in range(row_indices.size):
indexed_sum[r_i_f[i], c_i_f[i]] += v_f[i]
return indexed_sum
def method3(values,row_indices,col_indices):
"""raveling, then avoiding range(...), just zipping
the time saved here is small but by no cost"""
r_i_f = row_indices.ravel()
c_i_f = col_indices.ravel()
v_f = values.ravel()
indexed_sum = np.zeros((row_indices.max()+1,col_indices.max()+1))
for i,j,v in zip(r_i_f,c_i_f,v_f):
indexed_sum[i, j] += v
return indexed_sum
from time import perf_counter
import numpy as np
out_size = 50
in_shape = (5000,5000)
values = np.random.randint(10,size=in_shape)
row_indices = np.random.randint(out_size,size=in_shape)
col_indices = np.random.randint(out_size,size=in_shape)
t1 = perf_counter()
v1 = method1(values,row_indices,col_indices)
t2 = perf_counter()
v2 = method2(values,row_indices,col_indices)
t3 = perf_counter()
v3 = method3(values,row_indices,col_indices)
t4 = perf_counter()
print(f"method1: {t2-t1}")
print(f"method2: {t3-t2}")
print(f"method3: {t4-t3}")
Outputs for values of shape 5000x5000 and output shaped as 50x50:
method1: 23.66934896100429
method2: 14.241692076990148
method3: 11.415708078013267
aditional a comparison between fltten methods (in my computer)
q = np.random.randn(5000,5000)
t1 = perf_counter()
q1 = q.flatten()
t2 = perf_counter()
q2 = q.ravel()
t3 = perf_counter()
q3 = q.reshape(-1)
t4 = perf_counter()
q4 = q.flat[:]
t5 = perf_counter()
#print times:
print(f"q.flatten: {t2-t1}")
print(f"q.ravel: {t3-t2}")
print(f"q.reshape(-1): {t4-t3}")
print(f"q.flat[:]: {t5-t4}")
Outputs:
q.flatten: 0.043878231997950934
q.ravel: 5.550700007006526e-05
q.reshape(-1): 0.0006349250033963472
q.flat[:]: 0.08832104799512308