# -*- coding: utf-8 -*-
#initialize a (very) simple caching mechanism for search results
import pandas as pd, requests
country_cache = pd.DataFrame()
''' Cache for countries data '''
database_cache = pd.DataFrame()
''' Cache for databases data '''
[docs]def country_search(keyword, regex = False):
"""
Function to identify country codes and names from a keyword seach.
It also returns codes from aggregated spatial units.
The function returns a pandas dataframe of country codes and countries matching the search,
which can be done by normal string methods (the default) or a regular expression
The search operates on cached data or calls country_codes if the cache is empty.
Parameters
----------
keyword : str or regular expression
The keyword or regular expression to search. Not case-sensitive.
regex : bool (optional), default=False
Whether the keyword should be searched as a regular expression.
Defaults to False, in which case normal string matching is used.
Returns
-------
match : pandas.core.frame.DataFrame
A DataFrame of matched results, including country code and country.
Examples
--------
>>> searches.country_search("germany")
Returns keyword matches for simple string search "germany"
>>> searches.country_search("^B.*a$", regex=True)
Returns matches for countries starting with B and ending in a
"""
#Input data types and values- validation
assert isinstance(keyword, str),"Invalid inputs, please try again."
global country_cache
if country_cache.empty:
#get full list of countries if cache is empty
codes = country_codes()
else:
#otherwise just access the cached countries
codes = country_cache
#give the user the option to use regex to search if desired
if regex == False:
keyword = keyword.lower()
match = codes[codes['Country'].str.lower().str.contains(keyword, regex = False)]
else:
match = codes[codes['Country'].str.contains(keyword, regex = True)]
return match
[docs]def country_codes():
"""
Function returns a dataframe of all IMF countries and codes for which data can be accessed through the JSON API.
The resulting dataframe is cached to the local environment using a simple technique.
Parameters
----------
None
Returns
-------
database_cache : pandas.core.frame.DataFrame
A DataFrame of all country names and codes, cached to local environment.
Examples
--------
>>> searches.country_codes()
Returns country codes and caches to local environment.
"""
#only request data if it hasn't been cached
global country_cache
if country_cache.empty:
#import libraries and define base URL for API
start_url = "http://dataservices.imf.org/REST/SDMX_JSON.svc/"
# send the get request, use the DOTS database as the database_id
database_id = 'DOT'
r = requests.get(f'{start_url}DataStructure/{database_id}')
print(r)
# assert the response was 200 (OK)
assert r.status_code==200, "Error - HTTP request was unsuccessful."
#convert the data to subscriptable json
data_json = r.json()
#get codelist (which contains countries)
country_codelist = data_json['Structure']['CodeLists']['CodeList'][2]['Code']
#get list of countries and codes with a list comprehension
codes = [country['@value'] for country in country_codelist]
countries = [country['Description']['#text'] for country in country_codelist]
#cache the result to avoid running it again
country_cache = pd.DataFrame({"Country Code": codes, "Country": countries})
return country_cache
[docs]def database_codes():
"""
Function returns a dataframe of all IMF databases from which data can be accessed through the JSON API.
The resulting dataframe is cached to the local environment using a simple technique.
Parameters
----------
None
Returns
-------
database_cache : pandas.core.frame.DataFrame
A DataFrame of all Database names and codes, cached to local environment.
Examples
--------
>>> searches.database_codes()
Returns database codes and caches to local environment.
"""
#only request data if it hasn't been cached
global database_cache
if database_cache.empty:
#define IMF data services API start point
start_url = "http://dataservices.imf.org/REST/SDMX_JSON.svc"
#requests.get the full list of databases, convert to json
import requests
r = requests.get(f'{start_url}/Dataflow')
print(r)
#assert the response was 200 (OK)
assert r.status_code==200, "Error - HTTP request was unsuccessful."
#convert the data to subscriptable json
data_json = r.json()
#convert results to dataframe
df_temp = pd.DataFrame(data_json['Structure']['Dataflows']['Dataflow'])
#parse out columns that themselves contain multiple cols of data
parsed_Name = pd.DataFrame([database['Name'] for database in data_json['Structure']['Dataflows']['Dataflow']])
parsed_KeyFamilyRef = pd.DataFrame([database['KeyFamilyRef'] for database in data_json['Structure']['Dataflows']['Dataflow']])
#clean up dataframe columns
df_temp = df_temp.join(parsed_Name).join(parsed_KeyFamilyRef)[['@id', '#text']]
df_temp = df_temp.rename(columns={'@id': 'Database ID', '#text': 'Description'})
df_temp['Database ID'] = df_temp['Database ID'].str.replace("DS-","")
#store clean dataframe to global cache
database_cache = df_temp.sort_values('Database ID').reset_index(drop=True)
#return cache
return database_cache
[docs]def database_search(keyword, regex = False):
"""
Function to identify database codes and names from a keyword seach.
The function returns a pandas dataframe of database codes and databases matching the search,
which can be done by normal string methods (the default) or a regular expression
The search operates on cached data or calls database_codes if the cache is empty.
Parameters
----------
keyword : str or regular expression
The keyword or regular expression to search. Not case-sensitive.
regex : bool (optional), default=False
Whether the keyword should be searched as a regular expression.
Defaults to False, in which case normal string matching is used.
Returns
-------
match : pandas.core.frame.DataFrame
A DataFrame of matched results, including database code and database.
Examples
--------
>>> searches.database_search("development")
Returns keyword matches for simple string search "development"
>>> searches.database_search("^Financial.*", regex=True)
Returns matches for databases that start with "Financial"
"""
#Input data types and values- validation
assert isinstance(keyword, str),"Invalid inputs, please try again."
global database_cache
if database_cache.empty:
#get full list of countries if cache is empty
codes = database_codes()
else:
#otherwise just access the cached countries
codes = database_cache
#give the user the option to use regex to search if desired
if regex == False:
keyword = keyword.lower()
match = codes[codes['Description'].str.lower().str.contains(keyword, regex = False)]
else:
match = codes[codes['Description'].str.contains(keyword, regex = True)]
return match
[docs]def database_info(database_id):
"""
Returns the high-level information on a particular user-specified database.
Parameters
----------
database_id : str
The database ID of the database of interest.
Checks against database cache to validate input.
Returns
-------
info : pandas.core.frame.DataFrame
A DataFrame of information (update time, name, definition, methodology, etc.)
about the specified database.
Examples
--------
>>> searches.database_info('FSI')
Returns information about the database 'FSI' (Financial Soundness Indicators)
"""
global database_cache
if database_cache.empty:
#get full list of countries if cache is empty
codes = database_codes()
else:
#otherwise just access the cached countries
codes = database_cache
#check the database ID is valid before sending a request
assert codes['Database ID'].str.fullmatch(database_id).any(), "Invalid database. Please try again."
#define IMF data services API start point
start_url = "http://dataservices.imf.org/REST/SDMX_JSON.svc"
#send the get request
import requests
r = requests.get(f'{start_url}/DataStructure/{database_id}')
print(r)
#assert the response was 200 (OK)
assert r.status_code==200, "Error - HTTP request was unsuccessful."
#convert the data to subscriptable json
data_json = r.json()
#get info from annotations
annotations_json = data_json['Structure']['KeyFamilies']['KeyFamily']['Annotations']['Annotation']
#parse two columns: title and text
titles = [annotation['AnnotationTitle'] for annotation in annotations_json]
text_raw = [annotation['AnnotationText']['#text'] for annotation in annotations_json]
#clean html tags out of the text if they exist
import re
text_clean = [re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});','',text) for text in text_raw]
#return neat dataframe of database info
info = pd.DataFrame({'Variable':titles, 'Value':text_clean})
return info
[docs]def database_dimensions(database_id):
"""
This function returns the dimensions of a particular user-specified database.
Database dimensions are effectively indicator collections indexed by an indicator_id.
Parameters
----------
database_id : str
The database ID of the database of interest.
Checks against database cache to validate input.
Returns
-------
dimensions : pandas.core.frame.DataFrame
A DataFrame of dimensions (typically frequencies, spatial units and indicators)
that can be accessed through the specified database
Examples
--------
>>> searches.database_dimensions('FSI')
Returns dimensions of the database 'FSI' (Financial Soundness Indicators)
"""
global database_cache
if database_cache.empty:
#get full list of countries if cache is empty
codes = database_codes()
else:
#otherwise just access the cached countries
codes = database_cache
#check the database ID is valid before sending a request
assert codes['Database ID'].str.fullmatch(database_id).any(), "Invalid database. Please try again."
#define IMF data services API start point
start_url = "http://dataservices.imf.org/REST/SDMX_JSON.svc"
#send the get request
r = requests.get(f'{start_url}/DataStructure/{database_id}')
print(r)
#assert the response was 200 (OK)
assert r.status_code==200, "Error - HTTP request was unsuccessful."
#convert the data to subscriptable json
data_json = r.json()
#get info from KeyFamilies --> Components
dimensions_temp = data_json['Structure']['KeyFamilies']['KeyFamily']['Components']['Dimension']
#return neat dataframe of database dimensions, dropping redundant columns
dimensions_temp = pd.DataFrame(dimensions_temp)[['@conceptRef', '@conceptSchemeRef','@codelist']]
dimensions = dimensions_temp.rename(columns={'@conceptRef': 'Concept',
'@conceptSchemeRef': 'Scheme',
'@codelist': 'Indicator ID'})
#have a column for Database ID so user can keep track of search
dimensions.insert(0, "Database ID", database_id)
return dimensions
[docs]def indicator_dimensions(indicator_id):
"""
Function returns a dataframe of indicator dimensions and series IDs
(i.e. the most granular unit of data apart from individual values)
for a given user-specified indicator ID.
Indicator dimensions are single time series.
Parameters
----------
indicator_id : str
The indicator ID of the indicator of interest.
Returns
-------
indicator_dimensions : pandas.core.frame.DataFrame
A DataFrame of indicator dimensions or series that can be accessed via a particular indicator_id
Examples
--------
>>> searches.indicator_dimensions('CL_INDICATOR_FSI')
Returns indicators dimensions and series for the indicator ID 'CL_INDICATOR_FSI'
"""
#define IMF data services API start point
start_url = "http://dataservices.imf.org/REST/SDMX_JSON.svc"
# pull the data
r = requests.get(f'{start_url}/CodeList/{indicator_id}')
print(r)
#assert the response was 200 (OK)
assert r.status_code==200, "Error - HTTP request was unsuccessful."
#convert the data to subscriptable json
data_json = r.json()
#get codelist for that indicator
codelist = data_json['Structure']['CodeLists']['CodeList']['Code']
#get list of codes and descriptions with a list comprehension
codes = [code['@value'] for code in codelist]
descriptions = [code['Description']['#text'] for code in codelist]
#return neat dataframe of codes and descriptions
#have a column for Indicator ID so user can keep track of search
indicator_dimensions = pd.DataFrame({'Indicator ID':indicator_id,
'Series ID':codes,
'Description':descriptions})
return indicator_dimensions