"""Provides convenient access to data viz challenge data. Source: https://github.com/localytics/data-viz-challenge This dataset is excellent for testing and demonstrating data viz capabilities because it contains numerous categorical columns, with both high and low cardinality, columns with NaN values, dates and locations. This is a very good example of the kind of data that you might see from an information system, where the analyst might be simply helping visualize the data (business intelligence), or trying to understand how to exploit the data for better system performance. This script will download the json data, only the first time imported from, then will load the data and clean it up in a pandas DataFrame. Resulting dataframe reports the following dtypes: age object amount float64 category object client_time datetime64[ns] device object event_name object gender object city object latitude float64 longitude float64 state object zip_code int64 marital_status object session_id object """ from __future__ import absolute_import from bokeh.util.dependencies import import_required pd = import_required('pandas', 'project_funding sample data requires Pandas (http://pandas.pydata.org) to be installed') import os from six.moves.urllib.request import URLopener from bokeh.charts.utils import df_from_json DATA_URL = "https://raw.githubusercontent.com/localytics/data-viz-challenge/master/data.json" DOWNLOAD_NAME = 'project_funding.json' CSV_NAME = 'project_funding.csv' # Get absolute path relative to script data_dir = os.path.dirname(os.path.realpath(__file__)) json_file_path = os.path.join(data_dir, DOWNLOAD_NAME) csv_file_path = os.path.join(data_dir, CSV_NAME) def download_project_funding(): if not os.path.isfile(json_file_path): print('Downloading project funding source data.') json_data = URLopener() json_data.retrieve(DATA_URL, json_file_path) print('Download complete!') def load_project_funding(): project_funding = df_from_json(json_file_path) # cleanup column names cols = project_funding.columns flat_cols = [col.split('.')[1] if '.' in col else col for col in cols] project_funding.columns = flat_cols # convert to dates project_funding['client_time'] = pd.to_datetime(project_funding['client_time'], unit='s') return project_funding def load_cached_funding(): if not os.path.isfile(csv_file_path): project_funding = load_project_funding() project_funding.to_csv(csv_file_path, index=False) else: project_funding = pd.read_csv(csv_file_path, parse_dates=['client_time']) return project_funding download_project_funding() project_funding = load_cached_funding()