Help making better use of numpy array functions

Hi I have written some code and I would appreciate any suggestions to make better use of the numpy arrays functions to make it a bit more efficient and less of a port from C. Any tricks are thoughts would be much appreciated. The code reads in a series of images, collects a running total if the value is valid and averages everything every 8 images. Code below... Thanks in advance... #!/usr/bin/env python """ Average the daily LST values to make an 8 day product... Martin De Kauwe 18th November 2009 """ import sys, os, glob import numpy as np import matplotlib.pyplot as plt def averageEightDays(files, lst, count, numrows, numcols): day_count = 0 # starting day - tag for output file doy = 122 # loop over all the images and average all the information # every 8 days... for fname in glob.glob(os.path.join(path, files)): current_file = fname.split('/')[8] year = current_file.split('_')[2][0:4] try: f = open(fname, 'rb') except IOError: print "Cannot open outfile for read", fname sys.exit(1) data = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) f.close() # if it is day 1 then we need to fill up the holding array... # copy the first file into the array... if day_count == 0: lst = data for i in xrange(numrows): for j in xrange(numcols): # increase the pixel count if we got vaild data (undefined = -999.0 if lst[i,j] > -900.: count[i,j] = 1 day_count += 1 # keep a running total of valid lst values in an 8 day sequence elif day_count > 0 and day_count <= 7: for i in xrange(numrows): for j in xrange(numcols): # lst valid pixel? if data[i,j] > -900.: # was the existing pixel valid? if lst[i,j] < -900.: lst[i,j] = data[i,j] count[i,j] = 1 else: lst[i,j] += data[i,j] count[i,j] += 1 day_count += 1 # need to average previous 8 days and write to a file... if day_count == 8: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0 day_count = 0 doy += 8 lst, count = write_outputs(lst, count, year, doy) # it is possible we didn't have enough slices to do the last 8day avg... # but if we have more than one day we shall use it # need to average previous 8 days and write to a file... if day_count > 1: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0 day_count = 0 doy += 8 lst, count = write_outputs(lst, count, year, doy) def write_outputs(lst, count, year, doy): path = "/users/eow/mgdk/research/HOFF_plots/LST/8dayLST" outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra" try: of = open(os.path.join(path, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1) outfile2 = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" try: of2 = open(os.path.join(path, outfile2), 'wb') except IOError: print "Cannot open outfile for write", outfile2 sys.exit(1) # empty stuff and zero 8day counts lst.tofile(of) count.tofile(of2) of.close() of2.close() lst = 0.0 count = 0.0 return lst, count if __name__ == "__main__": numrows = 332 numcols = 667 path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/" lst = np.zeros((numrows, numcols), dtype=np.float32) count = np.zeros((numrows, numcols), dtype=np.int) averageEightDays('lst_scr_2006*.gra', lst, count, numrows, numcols) lst = 0.0 count = 0.0 averageEightDays('lst_scr_2007*.gra', lst, count, numrows, numcols) -- View this message in context: http://old.nabble.com/Help-making-better-use-of-numpy-array-functions-tp2650... Sent from the Numpy-discussion mailing list archive at Nabble.com.

I tried redoing the internal logic for example by using the where function but I can't seem to work out how to match up the logic. For example (note slightly different from above). If I change the main loop to lst = np.where((data > -900.0) & (lst < -900.0), data, lst) lst = np.where((data > -900.0) & (lst > -900.0), 5.0, lst) If I then had a data array value of 10.0 and lst array value of -999.0. In this scenario the first statement would set the LST value to 10.0. However when you run the second statement, data is still bigger than the undefined (-999.0) but now so is the LST, hence the lst is set to 5.0 This doesn't match with the logic of if data[i,j] > -900.: if lst[i,j] < -900.: lst[i,j] = data[i,j] elif lst[i,j] > -900.: lst[i,j] = 5.0 as it would never get to the second branch under this logic. Does anyone have any suggestions on how to match up the logic? mdekauwe wrote:
Hi I have written some code and I would appreciate any suggestions to make better use of the numpy arrays functions to make it a bit more efficient and less of a port from C. Any tricks are thoughts would be much appreciated.
The code reads in a series of images, collects a running total if the value is valid and averages everything every 8 images.
Code below...
Thanks in advance...
#!/usr/bin/env python
""" Average the daily LST values to make an 8 day product...
Martin De Kauwe 18th November 2009 """ import sys, os, glob import numpy as np import matplotlib.pyplot as plt
def averageEightDays(files, lst, count, numrows, numcols):
day_count = 0 # starting day - tag for output file doy = 122
# loop over all the images and average all the information # every 8 days... for fname in glob.glob(os.path.join(path, files)): current_file = fname.split('/')[8] year = current_file.split('_')[2][0:4]
try: f = open(fname, 'rb') except IOError: print "Cannot open outfile for read", fname sys.exit(1)
data = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) f.close()
# if it is day 1 then we need to fill up the holding array... # copy the first file into the array... if day_count == 0: lst = data for i in xrange(numrows): for j in xrange(numcols): # increase the pixel count if we got vaild data (undefined = -999.0 if lst[i,j] > -900.: count[i,j] = 1 day_count += 1
# keep a running total of valid lst values in an 8 day sequence elif day_count > 0 and day_count <= 7: for i in xrange(numrows): for j in xrange(numcols): # lst valid pixel? if data[i,j] > -900.: # was the existing pixel valid? if lst[i,j] < -900.: lst[i,j] = data[i,j] count[i,j] = 1 else: lst[i,j] += data[i,j] count[i,j] += 1 day_count += 1
# need to average previous 8 days and write to a file... if day_count == 8: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0
day_count = 0 doy += 8
lst, count = write_outputs(lst, count, year, doy)
# it is possible we didn't have enough slices to do the last 8day avg... # but if we have more than one day we shall use it # need to average previous 8 days and write to a file... if day_count > 1: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0
day_count = 0 doy += 8
lst, count = write_outputs(lst, count, year, doy)
def write_outputs(lst, count, year, doy): path = "/users/eow/mgdk/research/HOFF_plots/LST/8dayLST" outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra"
try: of = open(os.path.join(path, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1)
outfile2 = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" try: of2 = open(os.path.join(path, outfile2), 'wb') except IOError: print "Cannot open outfile for write", outfile2 sys.exit(1)
# empty stuff and zero 8day counts lst.tofile(of) count.tofile(of2) of.close() of2.close() lst = 0.0 count = 0.0
return lst, count
if __name__ == "__main__":
numrows = 332 numcols = 667
path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/" lst = np.zeros((numrows, numcols), dtype=np.float32) count = np.zeros((numrows, numcols), dtype=np.int) averageEightDays('lst_scr_2006*.gra', lst, count, numrows, numcols)
lst = 0.0 count = 0.0 averageEightDays('lst_scr_2007*.gra', lst, count, numrows, numcols)
-- View this message in context: http://old.nabble.com/Help-making-better-use-of-numpy-array-functions-tp2650... Sent from the Numpy-discussion mailing list archive at Nabble.com.

On Wed, Nov 25, 2009 at 4:13 PM, mdekauwe <mdekauwe@gmail.com> wrote:
I tried redoing the internal logic for example by using the where function but I can't seem to work out how to match up the logic. For example (note slightly different from above). If I change the main loop to
lst = np.where((data > -900.0) & (lst < -900.0), data, lst) lst = np.where((data > -900.0) & (lst > -900.0), 5.0, lst)
does an intermediate array help? lst2 = np.where((data > -900.0) & (lst < -900.0), data, lst) lst = np.where((data > -900.0) & (lst > -900.0), 5.0, lst2) I didn't read through all the loops to see what would be a better way to do it. Josef
If I then had a data array value of 10.0 and lst array value of -999.0. In this scenario the first statement would set the LST value to 10.0. However when you run the second statement, data is still bigger than the undefined (-999.0) but now so is the LST, hence the lst is set to 5.0
This doesn't match with the logic of
if data[i,j] > -900.: if lst[i,j] < -900.: lst[i,j] = data[i,j] elif lst[i,j] > -900.: lst[i,j] = 5.0
as it would never get to the second branch under this logic.
Does anyone have any suggestions on how to match up the logic?
mdekauwe wrote:
Hi I have written some code and I would appreciate any suggestions to make better use of the numpy arrays functions to make it a bit more efficient and less of a port from C. Any tricks are thoughts would be much appreciated.
The code reads in a series of images, collects a running total if the value is valid and averages everything every 8 images.
Code below...
Thanks in advance...
#!/usr/bin/env python
""" Average the daily LST values to make an 8 day product...
Martin De Kauwe 18th November 2009 """ import sys, os, glob import numpy as np import matplotlib.pyplot as plt
def averageEightDays(files, lst, count, numrows, numcols):
day_count = 0 # starting day - tag for output file doy = 122
# loop over all the images and average all the information # every 8 days... for fname in glob.glob(os.path.join(path, files)): current_file = fname.split('/')[8] year = current_file.split('_')[2][0:4]
try: f = open(fname, 'rb') except IOError: print "Cannot open outfile for read", fname sys.exit(1)
data = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) f.close()
# if it is day 1 then we need to fill up the holding array... # copy the first file into the array... if day_count == 0: lst = data for i in xrange(numrows): for j in xrange(numcols): # increase the pixel count if we got vaild data (undefined = -999.0 if lst[i,j] > -900.: count[i,j] = 1 day_count += 1
# keep a running total of valid lst values in an 8 day sequence elif day_count > 0 and day_count <= 7: for i in xrange(numrows): for j in xrange(numcols): # lst valid pixel? if data[i,j] > -900.: # was the existing pixel valid? if lst[i,j] < -900.: lst[i,j] = data[i,j] count[i,j] = 1 else: lst[i,j] += data[i,j] count[i,j] += 1 day_count += 1
# need to average previous 8 days and write to a file... if day_count == 8: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0
day_count = 0 doy += 8
lst, count = write_outputs(lst, count, year, doy)
# it is possible we didn't have enough slices to do the last 8day avg... # but if we have more than one day we shall use it # need to average previous 8 days and write to a file... if day_count > 1: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0
day_count = 0 doy += 8
lst, count = write_outputs(lst, count, year, doy)
def write_outputs(lst, count, year, doy): path = "/users/eow/mgdk/research/HOFF_plots/LST/8dayLST" outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra"
try: of = open(os.path.join(path, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1)
outfile2 = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" try: of2 = open(os.path.join(path, outfile2), 'wb') except IOError: print "Cannot open outfile for write", outfile2 sys.exit(1)
# empty stuff and zero 8day counts lst.tofile(of) count.tofile(of2) of.close() of2.close() lst = 0.0 count = 0.0
return lst, count
if __name__ == "__main__":
numrows = 332 numcols = 667
path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/" lst = np.zeros((numrows, numcols), dtype=np.float32) count = np.zeros((numrows, numcols), dtype=np.int) averageEightDays('lst_scr_2006*.gra', lst, count, numrows, numcols)
lst = 0.0 count = 0.0 averageEightDays('lst_scr_2007*.gra', lst, count, numrows, numcols)
-- View this message in context: http://old.nabble.com/Help-making-better-use-of-numpy-array-functions-tp2650... Sent from the Numpy-discussion mailing list archive at Nabble.com.
_______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion

On Nov 25, 2009, at 4:13 PM, mdekauwe wrote:
I tried redoing the internal logic for example by using the where function but I can't seem to work out how to match up the logic. For example (note slightly different from above). If I change the main loop to
lst = np.where((data > -900.0) & (lst < -900.0), data, lst) lst = np.where((data > -900.0) & (lst > -900.0), 5.0, lst)
If I then had a data array value of 10.0 and lst array value of -999.0. In this scenario the first statement would set the LST value to 10.0. However when you run the second statement, data is still bigger than the undefined (-999.0) but now so is the LST, hence the lst is set to 5.0
This doesn't match with the logic of
if data[i,j] > -900.: if lst[i,j] < -900.: lst[i,j] = data[i,j] elif lst[i,j] > -900.: lst[i,j] = 5.0
as it would never get to the second branch under this logic.
Does anyone have any suggestions on how to match up the logic?
np.where(data>-900,np.where(lst<-900,data,5.),lst) Note that this should fail if lst=-900

Thanks. Agreed it will break down under that scenario but that shouldn't be encountered as I am simply checking the value is greater than what I have set the undefined to be (-999.0). Here is the refjigged logic using numpy functionality for anyone who it might help. Would appreciate any suggestions how I can further improve this. The code is significantly more efficient now (thankfully ;P)! #!/usr/bin/env python """ Average the daily LST values to make an 8 day product... Martin De Kauwe 26th November 2009 """ import sys, os, glob import numpy as np def averageEightDays(files, lst, count, numrows, numcols): day_count = 0 doy = 122 for fname in glob.glob(os.path.join(path, files)): current_file = fname.split('/')[8] year = current_file.split('_')[2][0:4] try: f = open(fname, 'rb') except IOError: print "Cannot open outfile for read", fname sys.exit(1) data = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) f.close() # copy the first file into the array... # increase the pixel count if we have some valid data at timestep 1 if day_count == 0: lst = data count = np.where(lst > -900.0, 1, 0) day_count += 1 # keep a running total of valid lst values in an 8 day sequence # need to check whether the lst pixel is valid elif day_count > 0 and day_count <= 7: lst = np.where(data > -900.0, np.where(lst < -900.0, data, lst + data), lst) count = np.where(data > -900.0, np.where(lst < -900.0, 1, count + 1), count) day_count += 1 # need to average previous 8 days and write to a file... if day_count == 8: print year, ':', doy lst = np.where(count > 0, lst / np.float32(count), -999.0) day_count = 0 doy += 8 lst, count = write_outputs(lst, count, year, doy) # it is possible we didn't have enough slices to do the last 8day avg... # but if we have more than one day we shall use it # need to average previous 8 days and write to a file... if day_count > 1: lst = np.where(count > 0, lst / np.float32(count), -999.0) day_count = 0 doy += 8 lst, count = write_outputs(lst, count, year, doy) def write_outputs(lst, count, year, doy): path = "/users/eow/mgdk/research/HOFF_plots/LST/8dayLST" outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra" try: of = open(os.path.join(path, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1) outfile2 = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" try: of2 = open(os.path.join(path, outfile2), 'wb') except IOError: print "Cannot open outfile for write", outfile2 sys.exit(1) # empty stuff lst.tofile(of) count.tofile(of2) of.close() of2.close() lst = np.zeros((numrows, numcols), dtype=np.float32) count = np.zeros((numrows, numcols), dtype=np.int) return lst, count if __name__ == "__main__": numrows = 332 numcols = 667 path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/" lst = np.zeros((numrows, numcols), dtype=np.float32) count = np.zeros((numrows, numcols), dtype=np.int) averageEightDays('lst_scr_2006*.gra', lst, count, numrows, numcols) lst = np.zeros((numrows, numcols), dtype=np.float32) count = np.zeros((numrows, numcols), dtype=np.int) averageEightDays('lst_scr_2007*.gra', lst, count, numrows, numcols) Pierre GM-2 wrote:
On Nov 25, 2009, at 4:13 PM, mdekauwe wrote:
I tried redoing the internal logic for example by using the where function but I can't seem to work out how to match up the logic. For example (note slightly different from above). If I change the main loop to
lst = np.where((data > -900.0) & (lst < -900.0), data, lst) lst = np.where((data > -900.0) & (lst > -900.0), 5.0, lst)
If I then had a data array value of 10.0 and lst array value of -999.0. In this scenario the first statement would set the LST value to 10.0. However when you run the second statement, data is still bigger than the undefined (-999.0) but now so is the LST, hence the lst is set to 5.0
This doesn't match with the logic of
if data[i,j] > -900.: if lst[i,j] < -900.: lst[i,j] = data[i,j] elif lst[i,j] > -900.: lst[i,j] = 5.0
as it would never get to the second branch under this logic.
Does anyone have any suggestions on how to match up the logic?
np.where(data>-900,np.where(lst<-900,data,5.),lst)
Note that this should fail if lst=-900 _______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion
-- View this message in context: http://old.nabble.com/Help-making-better-use-of-numpy-array-functions-tp2650... Sent from the Numpy-discussion mailing list archive at Nabble.com.

Hi mdekauwe, as long as your data size is small enough to fit a 8x array in memory and use it, why not just skip the running total and average 8 data arrays each 8day period? And skip the x and y loops; these are real performance killers. As a bonus, your code gets a lot smaller and more readeable... :) Please correct me is I'm wrong: the ultimate goal is to have a average of the valid pixels 8 days of data, each 8 days, correct? I'd do it this way (more or less pythonic pseudocode, untested, but your should get the idea): # read filenames filenames = glob.glob..... filenames.sort() # to be sure they are in proper order filenamesChunked = [filenames[n:n+8] for n in range(0, len(filenames, 8)] # this will produce a list of lists, with each sublist containing the # next 8 filenames nodatavalue = -999 for fchunk in filenamesChunked: data8days = numpy.ones((8, ncols, nrows)) * -999 # fill with nodata values, in case there are less than 8 days for di in range(len(fchunk)): f = fchunk[di] data8days[di] = <read f> weights = data8days > nodatavalue # this way all nodata values get a weight of 0 avg8days = numpy.average(data8days, axis=0, weights=weights) <save avg8days> uhm... well, that's it, really :) best regards, Vincent. mdekauwe wrote:
Hi I have written some code and I would appreciate any suggestions to make better use of the numpy arrays functions to make it a bit more efficient and less of a port from C. Any tricks are thoughts would be much appreciated.
The code reads in a series of images, collects a running total if the value is valid and averages everything every 8 images.
Code below...
Thanks in advance...
#!/usr/bin/env python
""" Average the daily LST values to make an 8 day product...
Martin De Kauwe 18th November 2009 """ import sys, os, glob import numpy as np import matplotlib.pyplot as plt
def averageEightDays(files, lst, count, numrows, numcols):
day_count = 0 # starting day - tag for output file doy = 122
# loop over all the images and average all the information # every 8 days... for fname in glob.glob(os.path.join(path, files)): current_file = fname.split('/')[8] year = current_file.split('_')[2][0:4]
try: f = open(fname, 'rb') except IOError: print "Cannot open outfile for read", fname sys.exit(1)
data = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) f.close()
# if it is day 1 then we need to fill up the holding array... # copy the first file into the array... if day_count == 0: lst = data for i in xrange(numrows): for j in xrange(numcols): # increase the pixel count if we got vaild data (undefined = -999.0 if lst[i,j] > -900.: count[i,j] = 1 day_count += 1
# keep a running total of valid lst values in an 8 day sequence elif day_count > 0 and day_count <= 7: for i in xrange(numrows): for j in xrange(numcols): # lst valid pixel? if data[i,j] > -900.: # was the existing pixel valid? if lst[i,j] < -900.: lst[i,j] = data[i,j] count[i,j] = 1 else: lst[i,j] += data[i,j] count[i,j] += 1 day_count += 1
# need to average previous 8 days and write to a file... if day_count == 8: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0
day_count = 0 doy += 8
lst, count = write_outputs(lst, count, year, doy)
# it is possible we didn't have enough slices to do the last 8day avg... # but if we have more than one day we shall use it # need to average previous 8 days and write to a file... if day_count > 1: for i in xrange(numrows): for j in xrange(numcols): if count[i,j] > 0: lst[i,j] = lst[i,j] / count[i,j] else: lst[i,j] = -999.0
day_count = 0 doy += 8
lst, count = write_outputs(lst, count, year, doy)
def write_outputs(lst, count, year, doy): path = "/users/eow/mgdk/research/HOFF_plots/LST/8dayLST" outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra"
try: of = open(os.path.join(path, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1)
outfile2 = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" try: of2 = open(os.path.join(path, outfile2), 'wb') except IOError: print "Cannot open outfile for write", outfile2 sys.exit(1)
# empty stuff and zero 8day counts lst.tofile(of) count.tofile(of2) of.close() of2.close() lst = 0.0 count = 0.0
return lst, count
if __name__ == "__main__":
numrows = 332 numcols = 667
path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/" lst = np.zeros((numrows, numcols), dtype=np.float32) count = np.zeros((numrows, numcols), dtype=np.int) averageEightDays('lst_scr_2006*.gra', lst, count, numrows, numcols)
lst = 0.0 count = 0.0 averageEightDays('lst_scr_2007*.gra', lst, count, numrows, numcols)

Thanks...I have adopted that and as you said it is a lot neater! Though I need to keep the pixel count for a weighting in another piece of code so have amended your logic slightly. #!/usr/bin/env python import sys, os, glob import numpy as np def averageEightDays(filenamesList, numrows, numcols, year): doy = 122 nodatavalue = -999.0 for fchunk in filenamesList: # fill with nodata values, in case there are less than 8 days data8days = np.zeros((8, numrows, numcols), dtype=np.float32) * -999.0 pixCount = np.zeros((numrows, numcols), dtype=np.int) avg8days = np.zeros((numrows, numcols), dtype=np.float32) for day in xrange(len(fchunk)): f = fchunk[day] data8days[day] = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) avg8days = np.where(data8days[day] > nodatavalue, avg8days+data8days[day], avg8days) pixCount = np.where(data8days[day] > nodatavalue, pixCount+1, pixCount) avg8days = np.where(pixCount > 0, avg8days / np.float32(pixCount), -999.0) doy += 8 print year,':',doy outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra" write_outputs(outfile, avg8days) outfile = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" write_outputs(outfile, pixCount) def write_outputs(outfile, data): opath = "/users/eow/mgdk/research/HOFF_plots/LST/tmp" try: of = open(os.path.join(opath, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1) # empty stuff data.tofile(of) of.close() if __name__ == "__main__": numrows = 332 numcols = 667 path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/" files = 'lst_scr_2006*.gra' filenames = glob.glob(os.path.join(path, files)) filenames.sort() filenamesList = [filenames[n:n+8] for n in xrange(0, len(filenames), 8)] averageEightDays(filenamesList, numrows, numcols, year=2006) files = 'lst_scr_2007*.gra' filenames = glob.glob(os.path.join(path, files)) filenames.sort() filenamesList = [filenames[n:n+8] for n in xrange(0, len(filenames), 8)] averageEightDays(filenamesList, numrows, numcols, year=2007) -- View this message in context: http://old.nabble.com/Help-making-better-use-of-numpy-array-functions-tp2650... Sent from the Numpy-discussion mailing list archive at Nabble.com.

mdekauwe wrote:
Thanks...I have adopted that and as you said it is a lot neater! Though I need to keep the pixel count for a weighting in another piece of code so have amended your logic slightly.
Alternatively, you could simply take the sum over axis=0 of the weight array to get the pixel count (e.g. "pixelcount=weight.sum(axis=0)"). I don't know if performance is an issue, but I'd say that skipping these numpy.where's and increments of count and running total, and use numpy.average instead, should give you a speedup. Though the size of your arrays might be too small for it to become very noticeable. However, in your current version, it makes no sense to allocate an 8.nx.ny array to read the data, as you anyways calculate a running total. The 8,nx,ny array only makes sense to postpone calculations till you've read all 8 days, and only then calculate the average (and pixel count). Currently you preserve the previous days of data, but you don't use them anywhere anymore. Either way will work, though, I guess. Choose what feels most naturally for you, as that will help you maintain and understand your code later on. Oh, and minor issue: creating a array of zeros and then multiplying with -999 still makes an array of zeros... I'd incorporated an array of *ones* multiplied with -999, because for the last chunk of days you could end up with a 8day array only partly filled with real data. E.g. if you'd have only 3 days of data left in your last chunk, 8dayData[0:3] would be data, and to prevent the rest ([3:8]) to be incorporated into the average calculation, these need to be -999. Currently these pixels will be 0, which is > nodatavalue, and thus infuencing your average (the pixelcount will be incremented for these 0 values). Vincent.
#!/usr/bin/env python
import sys, os, glob import numpy as np
def averageEightDays(filenamesList, numrows, numcols, year): doy = 122 nodatavalue = -999.0
for fchunk in filenamesList: # fill with nodata values, in case there are less than 8 days data8days = np.zeros((8, numrows, numcols), dtype=np.float32) * -999.0 pixCount = np.zeros((numrows, numcols), dtype=np.int) avg8days = np.zeros((numrows, numcols), dtype=np.float32) for day in xrange(len(fchunk)): f = fchunk[day] data8days[day] = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) avg8days = np.where(data8days[day] > nodatavalue, avg8days+data8days[day], avg8days) pixCount = np.where(data8days[day] > nodatavalue, pixCount+1, pixCount)
avg8days = np.where(pixCount > 0, avg8days / np.float32(pixCount), -999.0) doy += 8 print year,':',doy
outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra" write_outputs(outfile, avg8days) outfile = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" write_outputs(outfile, pixCount)
def write_outputs(outfile, data): opath = "/users/eow/mgdk/research/HOFF_plots/LST/tmp"
try: of = open(os.path.join(opath, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1)
# empty stuff data.tofile(of) of.close()
if __name__ == "__main__":
numrows = 332 numcols = 667 path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/"
files = 'lst_scr_2006*.gra' filenames = glob.glob(os.path.join(path, files)) filenames.sort() filenamesList = [filenames[n:n+8] for n in xrange(0, len(filenames), 8)] averageEightDays(filenamesList, numrows, numcols, year=2006)
files = 'lst_scr_2007*.gra' filenames = glob.glob(os.path.join(path, files)) filenames.sort() filenamesList = [filenames[n:n+8] for n in xrange(0, len(filenames), 8)] averageEightDays(filenamesList, numrows, numcols, year=2007)

Vincent Schut-2 wrote:
Oh, and minor issue: creating a array of zeros and then multiplying with -999 still makes an array of zeros... I'd incorporated an array of *ones* multiplied with -999, because for the last chunk of days you could end up with a 8day array only partly filled with real data. E.g. if you'd have only 3 days of data left in your last chunk, 8dayData[0:3] would be data, and to prevent the rest ([3:8]) to be incorporated into the average calculation, these need to be -999. Currently these pixels will be 0, which is > nodatavalue, and thus infuencing your average (the pixelcount will be incremented for these 0 values).
Ok I hadn't thought about it in that way but you are of course right! I have amended it. Vincent Schut-2 wrote:
Alternatively, you could simply take the sum over axis=0 of the weight array to get the pixel count (e.g. "pixelcount=weight.sum(axis=0)").
Ok I see your point here as well. So I tried implementing your suggestion, as I understand it weights = data8days > nodatavalue will make and 8, nrows, ncols array containing true and false. as you said I can get the pixel count I was after by using weights.sum(axis=0). However when I try to do the averaging step: avg8days = np.average(data8days, axis=0, weights=weights) I get the error msg " in average raise ZeroDivisionError, "Weights sum to zero, can't be normalized" ZeroDivisionError: Weights sum to zero, can't be normalized" Which I guess (but don't know) comes from the trying do weight by a pixel count of zero. So not sure what has happened here? Thanks -- View this message in context: http://old.nabble.com/Help-making-better-use-of-numpy-array-functions-tp2650... Sent from the Numpy-discussion mailing list archive at Nabble.com.

mdekauwe wrote:
Vincent Schut-2 wrote:
Oh, and minor issue: creating a array of zeros and then multiplying with -999 still makes an array of zeros... I'd incorporated an array of *ones* multiplied with -999, because for the last chunk of days you could end up with a 8day array only partly filled with real data. E.g. if you'd have only 3 days of data left in your last chunk, 8dayData[0:3] would be data, and to prevent the rest ([3:8]) to be incorporated into the average calculation, these need to be -999. Currently these pixels will be 0, which is > nodatavalue, and thus infuencing your average (the pixelcount will be incremented for these 0 values).
Ok I hadn't thought about it in that way but you are of course right! I have amended it.
Vincent Schut-2 wrote:
Alternatively, you could simply take the sum over axis=0 of the weight array to get the pixel count (e.g. "pixelcount=weight.sum(axis=0)").
Ok I see your point here as well. So I tried implementing your suggestion, as I understand it
weights = data8days > nodatavalue
will make and 8, nrows, ncols array containing true and false.
as you said I can get the pixel count I was after by using weights.sum(axis=0).
However when I try to do the averaging step:
avg8days = np.average(data8days, axis=0, weights=weights)
I get the error msg " in average raise ZeroDivisionError, "Weights sum to zero, can't be normalized" ZeroDivisionError: Weights sum to zero, can't be normalized"
Which I guess (but don't know) comes from the trying do weight by a pixel count of zero. So not sure what has happened here?
Thanks
Ah... apparently numpy.average can't handle the situation when all weights are 0... didn't know that. Hmm... what would you want to have in your average array when for a certain pixel there are only nodata values? If you'd like to have -999 in your average result then, the solution is simple: in the weight array, for those pixels where weight is always 0, set 1 dayweight to 1. this way you'll get a nice -999 in your average result for those pixels. e.g.: weights[0] |= (weights.sum(axis=0)==0) will set (boolean OR assign) all pixels with weight==0 for all days to True (==1). Hope this helps. Vincent.

Yep that will do nicely, code becomes import sys, os, glob import numpy as np def averageEightDays(files, numrows, numcols, year, doy): """ Read in 8 files at a time, sum the valid LST, keep a count of the valid pixels and average the result every 8days. """ nodatavalue = -999.0 # break the files up into chunks of 8 filenames = glob.glob(os.path.join(path, files)) filenames.sort() filenamesList = [filenames[n:n+8] for n in xrange(0, len(filenames), 8)] for fchunk in filenamesList: # fill with nodata values, in case there are less than 8 days data8days = np.ones((8, numrows, numcols), dtype=np.float32) * -999.0 avg8days = np.zeros((numrows, numcols), dtype=np.float32) for day in xrange(len(fchunk)): fname = fchunk[day] try: f = open(fname, 'rb') except IOError: print "Cannot open outfile for read", fname sys.exit(1) data8days[day] = np.fromfile(f, dtype=np.float32).reshape(numrows, numcols) # build an array (ndays, nrow, ncols) of True and False for the pixel count # when these are summed we get the relative contributions weights = data8days > nodatavalue # np.average doesn't accept a weight of zero, COMMENT ME weights[0] |= (weights.sum(axis=0) == 0) pixelCount = weights.sum(axis=0) avg8days = np.average(data8days, axis=0, weights=weights) doy += 8 #print year,':',doy outfile = "lst_8day1030am_" + str(year) + str(doy) + ".gra" write_outputs(outfile, avg8days) outfile = "pixelcount_8day1030am_" + str(year) + str(doy) + ".gra" write_outputs(outfile, pixelCount) def write_outputs(outfile, data): opath = "/users/eow/mgdk/research/HOFF_plots/LST/8dayLST" try: of = open(os.path.join(opath, outfile), 'wb') except IOError: print "Cannot open outfile for write", outfile sys.exit(1) # empty stuff data.tofile(of) of.close() if __name__ == "__main__": numrows = 332 numcols = 667 path = "/users/eow/mgdk/research/HOFF_plots/LST/gridded_03/" averageEightDays('lst_scr_2006*.gra', numrows, numcols, year=2006, doy=122) averageEightDays('lst_scr_2007*.gra', numrows, numcols, year=2007, doy=122) -- View this message in context: http://old.nabble.com/Help-making-better-use-of-numpy-array-functions-tp2650... Sent from the Numpy-discussion mailing list archive at Nabble.com.
participants (4)
-
josef.pktd@gmail.com
-
mdekauwe
-
Pierre GM
-
Vincent Schut