# -*- coding: utf-8 -*- # flake8: noqa from __future__ import print_function from pandas.compat import map, reduce, range, lrange from pandas.io.common import urlopen from pandas.io import json import pandas import numpy as np import warnings warnings.warn("\n" "The pandas.io.wb module is moved to a separate package " "(pandas-datareader) and will be removed from pandas in a " "future version.\nAfter installing the pandas-datareader package " "(https://github.com/pydata/pandas-datareader), you can change " "the import ``from pandas.io import data, wb`` to " "``from pandas_datareader import data, wb``.", FutureWarning) # This list of country codes was pulled from wikipedia during October 2014. # While some exceptions do exist, it is the best proxy for countries supported # by World Bank. It is an aggregation of the 2-digit ISO 3166-1 alpha-2, and # 3-digit ISO 3166-1 alpha-3, codes, with 'all', 'ALL', and 'All' appended ot # the end. country_codes = ['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AO', 'AQ', 'AR', \ 'AS', 'AT', 'AU', 'AW', 'AX', 'AZ', 'BA', 'BB', 'BD', 'BE', \ 'BF', 'BG', 'BH', 'BI', 'BJ', 'BL', 'BM', 'BN', 'BO', 'BQ', \ 'BR', 'BS', 'BT', 'BV', 'BW', 'BY', 'BZ', 'CA', 'CC', 'CD', \ 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', \ 'CU', 'CV', 'CW', 'CX', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', \ 'DO', 'DZ', 'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET', 'FI', \ 'FJ', 'FK', 'FM', 'FO', 'FR', 'GA', 'GB', 'GD', 'GE', 'GF', \ 'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR', 'GS', \ 'GT', 'GU', 'GW', 'GY', 'HK', 'HM', 'HN', 'HR', 'HT', 'HU', \ 'ID', 'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT', \ 'JE', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', \ 'KP', 'KR', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', \ 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', \ 'MF', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', \ 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', \ 'NC', 'NE', 'NF', 'NG', 'NI', 'NL', 'NO', 'NP', 'NR', 'NU', \ 'NZ', 'OM', 'PA', 'PE', 'PF', 'PG', 'PH', 'PK', 'PL', 'PM', \ 'PN', 'PR', 'PS', 'PT', 'PW', 'PY', 'QA', 'RE', 'RO', 'RS', \ 'RU', 'RW', 'SA', 'SB', 'SC', 'SD', 'SE', 'SG', 'SH', 'SI', \ 'SJ', 'SK', 'SL', 'SM', 'SN', 'SO', 'SR', 'SS', 'ST', 'SV', \ 'SX', 'SY', 'SZ', 'TC', 'TD', 'TF', 'TG', 'TH', 'TJ', 'TK', \ 'TL', 'TM', 'TN', 'TO', 'TR', 'TT', 'TV', 'TW', 'TZ', 'UA', \ 'UG', 'UM', 'US', 'UY', 'UZ', 'VA', 'VC', 'VE', 'VG', 'VI', \ 'VN', 'VU', 'WF', 'WS', 'YE', 'YT', 'ZA', 'ZM', 'ZW', \ 'ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', \ 'ARG', 'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', \ 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', \ 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', \ 'BRA', 'BRB', 'BRN', 'BTN', 'BVT', 'BWA', 'CAF', 'CAN', \ 'CCK', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', \ 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CXR', \ 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', \ 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', \ 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', \ 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', \ 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', \ 'HKG', 'HMD', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', \ 'IND', 'IOT', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', \ 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', \ 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', \ 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', \ 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', \ 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', \ 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', \ 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', \ 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PCN', 'PER', \ 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', \ 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', \ 'SDN', 'SEN', 'SGP', 'SGS', 'SHN', 'SJM', 'SLB', 'SLE', \ 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', 'SUR', \ 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', \ 'TCD', 'TGO', 'THA', 'TJK', 'TKL', 'TKM', 'TLS', 'TON', \ 'TTO', 'TUN', 'TUR', 'TUV', 'TWN', 'TZA', 'UGA', 'UKR', \ 'UMI', 'URY', 'USA', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', \ 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', \ 'ZWE', 'all', 'ALL', 'All'] def download(country=None, indicator=None, start=2003, end=2005,errors='warn'): """ Download data series from the World Bank's World Development Indicators Parameters ---------- indicator: string or list of strings taken from the ``id`` field in ``WDIsearch()`` country: string or list of strings. ``all`` downloads data for all countries 2 or 3 character ISO country codes select individual countries (e.g.``US``,``CA``) or (e.g.``USA``,``CAN``). The codes can be mixed. The two ISO lists of countries, provided by wikipedia, are hardcoded into pandas as of 11/10/2014. start: int First year of the data series end: int Last year of the data series (inclusive) errors: str {'ignore', 'warn', 'raise'}, default 'warn' Country codes are validated against a hardcoded list. This controls the outcome of that validation, and attempts to also apply to the results from world bank. errors='raise', will raise a ValueError on a bad country code. Returns ------- ``pandas`` DataFrame with columns: country, iso_code, year, indicator value. """ if country is None: country = ['MX', 'CA', 'US'] if indicator is None: indicator = ['NY.GDP.MKTP.CD', 'NY.GNS.ICTR.ZS'] if type(country) == str: country = [country] bad_countries = np.setdiff1d(country, country_codes) # Validate the input if len(bad_countries) > 0: tmp = ", ".join(bad_countries) if errors == 'raise': raise ValueError("Invalid Country Code(s): %s" % tmp) if errors == 'warn': warnings.warn('Non-standard ISO country codes: %s' % tmp) # Work with a list of indicators if type(indicator) == str: indicator = [indicator] # Download data = [] bad_indicators = {} for ind in indicator: one_indicator_data,msg = _get_data(ind, country, start, end) if msg == "Success": data.append(one_indicator_data) else: bad_indicators[ind] = msg if len(bad_indicators.keys()) > 0: bad_ind_msgs = [i + " : " + m for i,m in bad_indicators.items()] bad_ind_msgs = "\n\n".join(bad_ind_msgs) bad_ind_msgs = "\n\nInvalid Indicators:\n\n%s" % bad_ind_msgs if errors == 'raise': raise ValueError(bad_ind_msgs) if errors == 'warn': warnings.warn(bad_ind_msgs) # Confirm we actually got some data, and build Dataframe if len(data) > 0: out = reduce(lambda x, y: x.merge(y, how='outer'), data) out = out.drop('iso_code', axis=1) out = out.set_index(['country', 'year']) out = out._convert(datetime=True, numeric=True) return out else: msg = "No indicators returned data." if errors == 'ignore': msg += " Set errors='warn' for more information." raise ValueError(msg) def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', start=2002, end=2005): if type(country) == str: country = [country] countries = ';'.join(country) # Build URL for api call url = ("http://api.worldbank.org/countries/" + countries + "/indicators/" + indicator + "?date=" + str(start) + ":" + str(end) + "&per_page=25000&format=json") # Download with urlopen(url) as response: data = response.read() # Check to see if there is a possible problem possible_message = json.loads(data)[0] if 'message' in possible_message.keys(): msg = possible_message['message'][0] try: msg = msg['key'].split() + ["\n "] + msg['value'].split() wb_err = ' '.join(msg) except: wb_err = "" if 'key' in msg.keys(): wb_err = msg['key'] + "\n " if 'value' in msg.keys(): wb_err += msg['value'] error_msg = "Problem with a World Bank Query \n %s" return None, error_msg % wb_err if 'total' in possible_message.keys(): if possible_message['total'] == 0: return None, "No results from world bank." # Parse JSON file data = json.loads(data)[1] country = [x['country']['value'] for x in data] iso_code = [x['country']['id'] for x in data] year = [x['date'] for x in data] value = [x['value'] for x in data] # Prepare output out = pandas.DataFrame([country, iso_code, year, value]).T out.columns = ['country', 'iso_code', 'year', indicator] return out,"Success" def get_countries(): """Query information about countries """ url = 'http://api.worldbank.org/countries/?per_page=1000&format=json' with urlopen(url) as response: data = response.read() data = json.loads(data)[1] data = pandas.DataFrame(data) data.adminregion = [x['value'] for x in data.adminregion] data.incomeLevel = [x['value'] for x in data.incomeLevel] data.lendingType = [x['value'] for x in data.lendingType] data.region = [x['value'] for x in data.region] data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'}) return data def get_indicators(): """Download information about all World Bank data series """ url = 'http://api.worldbank.org/indicators?per_page=50000&format=json' with urlopen(url) as response: data = response.read() data = json.loads(data)[1] data = pandas.DataFrame(data) # Clean fields data.source = [x['value'] for x in data.source] fun = lambda x: x.encode('ascii', 'ignore') data.sourceOrganization = data.sourceOrganization.apply(fun) # Clean topic field def get_value(x): try: return x['value'] except: return '' fun = lambda x: [get_value(y) for y in x] data.topics = data.topics.apply(fun) data.topics = data.topics.apply(lambda x: ' ; '.join(x)) # Clean outpu data = data.sort(columns='id') data.index = pandas.Index(lrange(data.shape[0])) return data _cached_series = None def search(string='gdp.*capi', field='name', case=False): """ Search available data series from the world bank Parameters ---------- string: string regular expression field: string id, name, source, sourceNote, sourceOrganization, topics See notes below case: bool case sensitive search? Notes ----- The first time this function is run it will download and cache the full list of available series. Depending on the speed of your network connection, this can take time. Subsequent searches will use the cached copy, so they should be much faster. id : Data series indicator (for use with the ``indicator`` argument of ``WDI()``) e.g. NY.GNS.ICTR.GN.ZS" name: Short description of the data series source: Data collection project sourceOrganization: Data collection organization note: sourceNote: topics: """ # Create cached list of series if it does not exist global _cached_series if type(_cached_series) is not pandas.core.frame.DataFrame: _cached_series = get_indicators() data = _cached_series[field] idx = data.str.contains(string, case=case) out = _cached_series.ix[idx].dropna() return out