"""Yearly sunspots data 1700-2008""" __docformat__ = 'restructuredtext' COPYRIGHT = """This data is public domain.""" TITLE = __doc__ SOURCE = """ http://www.ngdc.noaa.gov/stp/solar/solarda3.html The original dataset contains monthly data on sunspot activity in the file ./src/sunspots_yearly.dat. There is also sunspots_monthly.dat. """ DESCRSHORT = """Yearly (1700-2008) data on sunspots from the National Geophysical Data Center.""" DESCRLONG = DESCRSHORT NOTE = """:: Number of Observations - 309 (Annual 1700 - 2008) Number of Variables - 1 Variable name definitions:: SUNACTIVITY - Number of sunspots for each year The data file contains a 'YEAR' variable that is not returned by load. """ from numpy import recfromtxt, column_stack, array from pandas import Series, DataFrame from statsmodels.datasets.utils import Dataset from os.path import dirname, abspath def load(): """ Load the yearly sunspot data and returns a data class. Returns -------- Dataset instance: See DATASET_PROPOSAL.txt for more information. Notes ----- This dataset only contains data for one variable, so the attributes data, raw_data, and endog are all the same variable. There is no exog attribute defined. """ data = _get_data() endog_name = 'SUNACTIVITY' endog = array(data[endog_name], dtype=float) dataset = Dataset(data=data, names=[endog_name], endog=endog, endog_name=endog_name) return dataset def load_pandas(): data = DataFrame(_get_data()) # TODO: time series endog = Series(data['SUNACTIVITY'], index=data['YEAR'].astype(int)) dataset = Dataset(data=data, names=list(data.columns), endog=endog, endog_name='volume') return dataset def _get_data(): filepath = dirname(abspath(__file__)) data = recfromtxt(open(filepath + '/sunspots.csv', 'rb'), delimiter=",", names=True, dtype=float) return data