Find Daily max - create lists using date and add hourly data to that list for the day

I have hourly 2D temperature data in a monthly netcdf and I would like to find the daily maximum temperature. The shape of the netcdf is (744, 106, 193) I would like to use the year-month-day as a new list name (i.e. 2009-03-01, 2009-03-02....2009-03-31) and then add each of the hours worth of temperature data to each corresponding list. Therefore each new list should contain 24 hours worth of data and the shape should be (24,106,193) . This is the part I cannot seem to get to work. I am using datetime and then groupby to group by date but I am not sure how to use the output to make a new list name and then add the data for that day into that list. see below and attached for my latest attempt. Any feedback will be greatly appreciated. from netCDF4 import Dataset import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.basemap import Basemap from netcdftime import utime from datetime import datetime as dt import os import gc from numpy import * import pytz from itertools import groupby MainFolder=r"/DATA/2009/03" dailydate=[] alltime=[] lists={} ncvariablename='T_SFC' for (path, dirs, files) in os.walk(MainFolder): for ncfile in files: print ncfile fileext='.nc' if ncfile.endswith(ncvariablename+'.nc'): print "dealing with ncfiles:", path+ncfile ncfile=os.path.join(path,ncfile) ncfile=Dataset(ncfile, 'r+', 'NETCDF4') variable=ncfile.variables[ncvariablename][:,:,:] TIME=ncfile.variables['time'][:] ncfile.close() for temp, time in zip((variable[:]),(TIME[:])): cdftime=utime('seconds since 1970-01-01 00:00:00') ncfiletime=cdftime.num2date(time) timestr=str(ncfiletime) utc_dt = dt.strptime(timestr, '%Y-%m-%d %H:%M:%S') au_tz = pytz.timezone('Australia/Sydney') local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(au_tz) alltime.append(local_dt) for k, g in groupby(alltime, key=lambda d: d.date()): kstrp_local=k.strftime('%Y-%m-%d_%H') klocal_date=k.strftime('%Y-%m-%d') dailydate.append(klocal_date) for n in dailydate: lists[n]=[] lists[n].append(temp) 
big_array=np.ma.concatenate(lists[n]) DailyTemp=big_array.max(axis=0)

Hello anonymous, I recently wrote a package "xray" (http://xray.readthedocs.org/) specifically to make it easier to work with high-dimensional labeled data, as often found in NetCDF files. Xray has a groupby method for grouping over subsets of your data, which would seem well suited to what you're trying to do. Something like the following might work: ds = xray.open_dataset(ncfile) tmax = ds['temperature'].groupby('time.hour').max() It also might be worth looking at other data analysis packages, either more generic (e.g., pandas, http://pandas.pydata.org/) or weather/climate data specific (e.g., Iris, http://scitools.org.uk/iris/ and CDAT, http://www2-pcmdi.llnl.gov/cdat/manuals/cdutil/cdat_utilities.html). Cheers, Stephan On Wed, May 21, 2014 at 5:27 PM, questions anon <questions.anon@gmail.com> wrote:
I have hourly 2D temperature data in a monthly netcdf and I would like to find the daily maximum temperature. The shape of the netcdf is (744, 106, 193)
I would like to use the year-month-day as a new list name (i.e. 2009-03-01, 2009-03-02, ..., 2009-03-31) and then add each hour's worth of temperature data to the corresponding list. Therefore each new list should contain 24 hours' worth of data and its shape should be (24, 106, 193). This is the part I cannot seem to get to work. I am using datetime and then groupby to group by date, but I am not sure how to use the output to make a new list name and then add the data for that day into that list. See below and attached for my latest attempt. Any feedback will be greatly appreciated.
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from netcdftime import utime
from datetime import datetime as dt
import os
import gc
from numpy import *
import pytz
from itertools import groupby
MainFolder=r"/DATA/2009/03"
dailydate=[]
alltime=[]
lists={}
ncvariablename='T_SFC'
for (path, dirs, files) in os.walk(MainFolder):
for ncfile in files:
print ncfile
fileext='.nc'
if ncfile.endswith(ncvariablename+'.nc'):
print "dealing with ncfiles:", path+ncfile
ncfile=os.path.join(path,ncfile)
ncfile=Dataset(ncfile, 'r+', 'NETCDF4')
variable=ncfile.variables[ncvariablename][:,:,:]
TIME=ncfile.variables['time'][:]
ncfile.close()
for temp, time in zip((variable[:]),(TIME[:])):
cdftime=utime('seconds since 1970-01-01 00:00:00')
ncfiletime=cdftime.num2date(time)
timestr=str(ncfiletime)
utc_dt = dt.strptime(timestr, '%Y-%m-%d %H:%M:%S')
au_tz = pytz.timezone('Australia/Sydney')
local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(au_tz)
alltime.append(local_dt)
for k, g in groupby(alltime, key=lambda d: d.date()):
kstrp_local=k.strftime('%Y-%m-%d_%H')
klocal_date=k.strftime('%Y-%m-%d')
dailydate.append(klocal_date)
for n in dailydate:
lists[n]=[]
lists[n].append(temp)
big_array=np.ma.concatenate(lists[n])
DailyTemp=big_array.max(axis=0)
_______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion

Thanks Stephan, It doesn't look like CDAT has a 'daily' option - it has yearly, seasonal and monthly! I would need to look into IRIS more as it is new to me, and I can't quite figure out all the steps required for xray, although it looks great. Another way around: after converting to localtime_day I could append the corresponding hourly arrays to a list, concatenate, calculate the max and associate that max with that localtime_day. Then I could delete everything in that list and repeat by looping through the hours of the next day and appending to the empty list. Although I really don't know how to get this to work. On Thu, May 22, 2014 at 10:56 AM, Stephan Hoyer <shoyer@gmail.com> wrote:
Hello anonymous,
I recently wrote a package "xray" (http://xray.readthedocs.org/) specifically to make it easier to work with high-dimensional labeled data, as often found in NetCDF files. Xray has a groupby method for grouping over subsets of your data, which would seem well suited to what you're trying to do. Something like the following might work:
ds = xray.open_dataset(ncfile) tmax = ds['temperature'].groupby('time.hour').max()
It also might be worth looking at other data analysis packages, either more generic (e.g., pandas, http://pandas.pydata.org/) or weather/climate data specific (e.g., Iris, http://scitools.org.uk/iris/ and CDAT, http://www2-pcmdi.llnl.gov/cdat/manuals/cdutil/cdat_utilities.html).
Cheers, Stephan
On Wed, May 21, 2014 at 5:27 PM, questions anon <questions.anon@gmail.com> wrote:
I have hourly 2D temperature data in a monthly netcdf and I would like to find the daily maximum temperature. The shape of the netcdf is (744, 106, 193)
I would like to use the year-month-day as a new list name (i.e. 2009-03-01, 2009-03-02, ..., 2009-03-31) and then add each hour's worth of temperature data to the corresponding list. Therefore each new list should contain 24 hours' worth of data and its shape should be (24, 106, 193). This is the part I cannot seem to get to work. I am using datetime and then groupby to group by date, but I am not sure how to use the output to make a new list name and then add the data for that day into that list. See below and attached for my latest attempt. Any feedback will be greatly appreciated.
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from netcdftime import utime
from datetime import datetime as dt
import os
import gc
from numpy import *
import pytz
from itertools import groupby
MainFolder=r"/DATA/2009/03"
dailydate=[]
alltime=[]
lists={}
ncvariablename='T_SFC'
for (path, dirs, files) in os.walk(MainFolder):
for ncfile in files:
print ncfile
fileext='.nc'
if ncfile.endswith(ncvariablename+'.nc'):
print "dealing with ncfiles:", path+ncfile
ncfile=os.path.join(path,ncfile)
ncfile=Dataset(ncfile, 'r+', 'NETCDF4')
variable=ncfile.variables[ncvariablename][:,:,:]
TIME=ncfile.variables['time'][:]
ncfile.close()
for temp, time in zip((variable[:]),(TIME[:])):
cdftime=utime('seconds since 1970-01-01 00:00:00')
ncfiletime=cdftime.num2date(time)
timestr=str(ncfiletime)
utc_dt = dt.strptime(timestr, '%Y-%m-%d %H:%M:%S')
au_tz = pytz.timezone('Australia/Sydney')
local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(au_tz)
alltime.append(local_dt)
for k, g in groupby(alltime, key=lambda d: d.date()):
kstrp_local=k.strftime('%Y-%m-%d_%H')
klocal_date=k.strftime('%Y-%m-%d')
dailydate.append(klocal_date)
for n in dailydate:
lists[n]=[]
lists[n].append(temp)
big_array=np.ma.concatenate(lists[n])
DailyTemp=big_array.max(axis=0)
_______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion
_______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion
participants (2)
-
questions anon
-
Stephan Hoyer