import os
import json
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import plotly.plotly as py
import matplotlib.pyplot as plt
import cufflinks as cf
import statsmodels.api as sm
cf.go_offline()
%matplotlib inline
The code below is needed to retrieve Google Transparency Report data.
You can skip reading this section; just be sure to run it.
# The functions below are a Python port of the official GWT library.
# see: https://github.com/gwtproject/gwt/blob/master/user/src/com/google/gwt/user/client/rpc/impl/AbstractSerializationStream.java
def long_from_base64(value):
pos = 0
long_val = base64_value(value[pos])
pos += 1
while pos < len(value):
long_val <<= 6
long_val |= base64_value(value[pos])
pos += 1
    return long_val
def long_to_base64(value):
# Convert to ints early to avoid need for long ops
low = value & 0xffffffff
high = value >> 32
sb = ''
hnz = False # Have Non zero
hnz, s = base64_digit((high >> 28) & 0xf, hnz)
sb += s
hnz, s = base64_digit((high >> 22) & 0x3f, hnz)
sb += s
hnz, s = base64_digit((high >> 16) & 0x3f, hnz)
sb += s
hnz, s = base64_digit((high >> 10) & 0x3f, hnz)
sb += s
hnz, s = base64_digit((high >> 4) & 0x3f, hnz)
sb += s
v = ((high & 0xf) << 2) | ((low >> 30) & 0x3)
hnz, s = base64_digit(v, hnz)
sb += s
hnz, s = base64_digit((low >> 24) & 0x3f, hnz)
sb += s
hnz, s = base64_digit((low >> 18) & 0x3f, hnz)
sb += s
hnz, s = base64_digit((low >> 12) & 0x3f, hnz)
sb += s
hnz, s = base64_digit((low >> 6) & 0x3f, hnz)
sb += s
hnz, s = base64_digit(low & 0x3f, hnz)
sb += s
return sb
def base64_digit(digit, have_non_zero):
if digit > 0:
have_non_zero = True
if have_non_zero:
if (digit < 26):
return (have_non_zero, chr(ord('A') + digit))
elif (digit < 52):
return (have_non_zero, chr(ord('a') + digit - 26))
elif (digit < 62):
return (have_non_zero, chr(ord('0') + digit - 52))
elif (digit == 62):
return (have_non_zero, '$')
else:
return (have_non_zero, '_')
return (have_non_zero, '')
def base64_value(char):
# Assume digit is one of [A-Za-z0-9$_]
if (char >= 'A' and char <= 'Z'):
return ord(char) - ord('A')
# No need to check digit <= 'z'
if (char >= 'a'):
return ord(char) - ord('a') + 26
if (char >= '0' and char <= '9'):
return ord(char) - ord('0') + 52
if (char == '$'):
return 62
# digit == '_'
return 63
assert long_to_base64(long_from_base64('U3Ay4uu')) == 'U3Ay4uu'
def dt_to_gwt_b64(dt):
"""
Convert a date to gwt base64 format (string)
"""
epoch = datetime.utcfromtimestamp(0)
ms_since_epoch = int((dt - epoch).total_seconds()*1000)
return long_to_base64(ms_since_epoch)
def gwt_b64_to_dt(s):
"""
Takes a gwt base64 format and makes it into a date
"""
ms_since_epoch = long_from_base64(s)
return datetime.utcfromtimestamp(ms_since_epoch/1000)
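A quick round-trip check of the two date helpers (the value below is just illustrative):
example_dt = datetime(2016, 6, 1, 12, 30)
assert gwt_b64_to_dt(dt_to_gwt_b64(example_dt)) == example_dt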
def parse_time_series(response):
"""
This is an attempt at reversing their format.
Not all works and sometimes it breaks unexpectedly.
"""
series = {
'start_time': gwt_b64_to_dt(response[2]),
'end_time': gwt_b64_to_dt(response[0])
}
timestamps = []
values = []
count = None
idx = 5
while True:
value = response[idx]
dtype = response[idx+1]
if dtype == 13:
values.append(value)
elif dtype == 8:
count = value
idx += 2
break
idx += 2
offset = 0
while True:
        # print(response[offset+idx:offset+idx+10])
value = response[offset+idx]
dtype = response[offset+idx+1]
        if value == -3:
            # XXX this appears to happen when a value is missing.
            # Extrapolate the next timestamp by continuing the (descending)
            # series with the same step as the previous two entries.
            if len(timestamps) > 2:
                timestamps.append(timestamps[-1] - (timestamps[-2] - timestamps[-1]))
            offset += 1
            continue
if dtype != 3:
break
#assert dtype == 3, "dtype %s != 3" % dtype
dt = gwt_b64_to_dt(value)
if dt < series['start_time']:
break
#print(dt)
timestamps.append(dt)
offset += 2
series['count'] = count
series['timestamps'] = timestamps
series['values'] = values[:len(timestamps)]
return series
def get_time_ranges(start_time, end_time, days=7):
if end_time - start_time <= timedelta(days=days):
return [
(start_time, end_time)
]
time_ranges = []
range_start = start_time
range_end = range_start + timedelta(days=days)
while range_end <= end_time:
time_ranges.append((range_start, range_end))
range_start = range_end
range_end += timedelta(days=days)
    # Append the final partial range, skipping it if the span divided evenly
    # (otherwise we would emit a degenerate zero-length range).
    if range_start < end_time:
        time_ranges.append((range_start, end_time))
    return time_ranges
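For example (illustrative dates), a 20-day window splits into two full 7-day chunks plus a 6-day remainder:
get_time_ranges(datetime(2016, 6, 1), datetime(2016, 6, 21))
# -> [(2016-06-01, 2016-06-08), (2016-06-08, 2016-06-15), (2016-06-15, 2016-06-21)]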
GOOGLE_ALPHA_2_CODES = {'BD': 20, 'BE': 21, 'BF': 22, 'BG': 23, 'BA': 18, 'BB': 19, 'BM': 28, 'BN': 29,
'BO': 30, 'BH': 24, 'BI': 25, 'BJ': 26, 'BT': 34, 'JM': 123, 'BW': 37, 'WS': 262,
'BR': 32, 'BS': 33, 'JE': 122, 'BY': 38, 'BZ': 39, 'RU': 205, 'RW': 206, 'RS': 204,
'TL': 237, 'RE': 202, 'TM': 238, 'TJ': 235, 'RO': 203, 'GU': 102, 'GT': 101, 'GR': 99,
'GQ': 98, 'GP': 97, 'JP': 125, 'GY': 104, 'GG': 91, 'GF': 90, 'GE': 89, 'GD': 88, 'GB': 87,
'GA': 86, 'SV': 225, 'GN': 96, 'GM': 95, 'GL': 94, 'GI': 93, 'GH': 92, 'OM': 184, 'TN': 239,
'JO': 124, 'HR': 108, 'HT': 109, 'HU': 110, 'HK': 105, 'HN': 107, 'VE': 256, 'PR': 194,
'PS': 195, 'PW': 197, 'PT': 196, 'PY': 198, 'IQ': 118, 'PA': 185, 'PF': 187, 'PG': 188,
'PE': 186, 'PK': 190, 'PH': 189, 'PL': 191, 'PM': 192, 'ZM': 272, 'EE': 71, 'EG': 72,
'ZA': 271, 'EC': 70, 'IT': 121, 'VN': 259, 'SB': 208, 'ET': 76, 'ZW': 274, 'SA': 207,
'ES': 75, 'ME': 151, 'MD': 150, 'MG': 153, 'MA': 148, 'MC': 149, 'UZ': 253, 'MM': 157,
'ML': 156, 'MO': 159, 'MN': 158, 'MH': 154, 'MK': 155, 'MU': 165, 'MT': 164, 'MW': 167,
'MV': 166, 'MQ': 161, 'MP': 160, 'MS': 163, 'MR': 162, 'IM': 115, 'UG': 248, 'TZ': 246,
'MY': 169, 'MX': 168, 'IL': 114, 'FR': 84, 'FI': 79, 'FJ': 80, 'FO': 83, 'NI': 176,
'NL': 177, 'NO': 178, 'SO': 220, 'VU': 260, 'NC': 172, 'NE': 173, 'NF': 174, 'NG': 175,
'NZ': 183, 'NP': 179, 'NR': 180, 'CK': 47, 'CI': 46, 'CH': 45, 'CO': 51, 'CN': 50, 'CM': 49,
'CL': 48, 'CA': 40, 'CG': 44, 'CF': 43, 'CD': 42, 'CZ': 60, 'CY': 59, 'CR': 53, 'CV': 56,
'CU': 55, 'SZ': 228, 'SY': 227, 'KG': 127, 'KE': 126, 'SR': 221, 'KI': 129, 'KH': 128,
'KN': 131, 'ST': 223, 'SK': 216, 'KR': 133, 'SI': 214, 'SH': 213, 'KW': 134, 'SN': 219,
'SL': 217, 'SC': 209, 'KZ': 136, 'KY': 135, 'SG': 212, 'SE': 211, 'SD': 210, 'DO': 67,
'DM': 66, 'DJ': 64, 'DK': 65, 'VG': 257, 'DE': 62, 'YE': 268, 'DZ': 68, 'US': 251, 'UY': 252,
'YT': 269, 'LB': 138, 'LC': 139, 'LA': 137, 'TV': 244, 'TW': 245, 'TT': 243, 'TR': 242,
'LK': 141, 'LI': 140, 'LV': 146, 'TO': 240, 'LT': 144, 'LU': 145, 'LR': 142, 'LS': 143,
'TH': 234, 'TG': 233, 'TD': 231, 'TC': 230, 'LY': 147, 'VC': 255, 'AE': 2, 'AD': 1, 'AG': 4,
'AF': 3, 'AI': 5, 'VI': 258, 'IS': 120, 'IR': 119, 'AM': 7, 'AL': 6, 'AO': 9, 'AR': 11,
'AU': 14, 'AT': 13, 'AW': 15, 'IN': 116, 'AX': 16, 'AZ': 17, 'IE': 113, 'ID': 112, 'UA': 247,
'QA': 199, 'MZ': 170}
def get_code_from_alpha2(alpha_2):
    try:
        return GOOGLE_ALPHA_2_CODES[alpha_2]
    except KeyError:
        raise RuntimeError("Country not supported: %s" % alpha_2)
def get_metrics_with_date(prod_code, region_code, start_time, end_time):
print("Getting metrics for %s - %s" % (start_time, end_time))
headers = {
'x-client-data': 'CJG2yQEIprbJAQjBtskBCPucygEIqZ3KAQ==',
'x-gwt-module-base': 'https://www.google.com/transparencyreport/gwt/',
'x-gwt-permutation':'DFD0EBA544B633919D593657A1CFAC69',
'content-type': 'text/x-gwt-rpc; charset=UTF-8',
'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6'
}
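    # The GWT-RPC payload below is pipe-delimited: a string table (URLs and
    # Java class names) followed by the encoded arguments, including the
    # product code, region code and the base64-encoded time range.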
data = ('7|0|11|https://www.google.com/transparencyreport/gwt/|A95F82F4A46F68F8F3518C8811783D00|'
'com.google.analysis.gblocked.traffic.frontend.shared.TrafficService|evaluateQuery|'
'com.google.analysis.gblocked.traffic.frontend.shared.TrafficRequest/1877719668|'
'com.google.analysis.gblocked.traffic.common.TimeSeries/3457781141|'
'com.google.analysis.gblocked.traffic.common.Logsource/3745169662|'
'com.google.i18n.identifiers.RegionCode/1527795405|'
'com.google.analysis.gblocked.traffic.frontend.shared.Zoom/2138134534|'
'com.google.analysis.gblocked.traffic.frontend.shared.Zoom$Source/3263501257|'
'java.util.Date/3385151746|'
'1|2|3|4|1|5|5|598|4|6|7|{prod_code}|8|{region_code}|9|10|1|11|{end_time}|11|{start_time}|'.format(
prod_code=prod_code,
region_code=region_code,
start_time=dt_to_gwt_b64(start_time),
end_time=dt_to_gwt_b64(end_time)
))
r = requests.post('https://www.google.com/transparencyreport/gwt/trafficService', headers=headers, data=data)
return r
def parse_response(data):
"""
Do our best to parse the response
"""
    # Strip the 4-character "//OK" prefix of the GWT-RPC response and coerce
    # the rest into valid JSON (single -> double quotes, \x -> \u00 escapes).
    return json.loads(data[4:].replace("'", '"').replace('\\x', '\\u00'))
def get_data_for_product(region_code, prod_code, start_time, end_time):
timestamps = []
values = []
for st, et in get_time_ranges(start_time, end_time):
r = get_metrics_with_date(prod_code, region_code, st, et)
response = parse_response(r.text)
ts = parse_time_series(response)
for idx, timestamp in enumerate(ts['timestamps']):
if timestamp < start_time or timestamp > end_time:
continue
timestamps.append(timestamp)
values.append(ts['values'][idx])
return timestamps, values
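As a quick usage sketch (illustrative values; a real call hits Google's endpoint), this fetches one week of Google Search data for Ethiopia:
ts, vals = get_data_for_product(
    get_code_from_alpha2('ET'),  # numeric region code
    12,                          # 12 == Google Search, see GOOGLE_PRODUCT_CODES below
    datetime(2016, 6, 1), datetime(2016, 6, 8))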
Below are the functions you should actually be calling.
GOOGLE_PRODUCT_CODES = {
'Blogger': 1,
'Gmail': 2,
'Google Books': 4,
'Google Docs': 5,
'Google Earth': 6,
'Google Groups': 7,
'Google Images': 8,
'Google Maps': 9,
'Google Search': 12,
'Google Sites': 13,
'Google Spreadsheets': 14,
'Google Translate': 16,
'Google Videos': 17,
'Orkut': 18,
'Picasa Web Albums': 19,
'Traffic Graph': 0,
'YouTube': 21
}
def get_df_traffic_for_country(alpha_2, start_time, end_time=None):
    """
    This function takes as input a country code as alpha-2 and returns a pandas
    dataframe with all the traffic data for it.
    """
    # Don't use datetime.utcnow() as a default argument: it would be evaluated
    # once at definition time, not on every call.
    if end_time is None:
        end_time = datetime.utcnow()
result = {
'timestamps': None
}
region_code = get_code_from_alpha2(alpha_2)
for prod_name, prod_code in GOOGLE_PRODUCT_CODES.items():
if prod_code not in [2, 21, 9, 8, 12]:
# We only care about: Gmail, Youtube, Maps, Images, Search
continue
print("Getting %s - %s" % (prod_name, alpha_2))
try:
timestamps, values = get_data_for_product(region_code, prod_code, start_time, end_time)
if result['timestamps']:
assert result['timestamps'] == timestamps
else:
result['timestamps'] = timestamps
except Exception as exc:
print("MISSING DATA")
print(exc)
continue
result[prod_name] = values
df = pd.DataFrame(result)
df.drop_duplicates(subset='timestamps', keep='last', inplace=True)
df.set_index('timestamps', inplace=True)
return df.sort_index()
df_et = get_df_traffic_for_country('ET', datetime(2016, 6, 1, 0, 0), datetime(2016, 8, 30, 0, 0))
We use Google Search as our metric, since it seems to be the most stationary one for every country (apart from seasonal variation).
target_metric = 'Google Search'
df = df_et.reset_index()[[target_metric, 'timestamps']]
df.columns = ['y', 'ds']
df.head()
The models require the following extra dependencies:
For the Facebook Prophet based model: fbprophet.
For the ARIMA model: statsmodels (already imported above).
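In a notebook, something along these lines should install them (assumed PyPI package names; pin versions as needed):
!pip install fbprophet
!pip install statsmodels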
This first model uses fbprophet, an "Automatic Forecasting Procedure" developed by Facebook. The paper to use as a reference is: https://facebookincubator.github.io/prophet/static/prophet_paper_20170113.pdf.
In order to support daily periods you will have to use my branch of it, available here: https://github.com/hellais/prophet/tree/feature/daily-seasonality
from fbprophet import Prophet
pm = Prophet(daily_seasonality=True)
pm.fit(df)
predictions = pm.predict(df[['ds', 'y']])
# Stock fbprophet does not echo the observed y column back in its output;
# copy it over so it can be plotted alongside the forecast.
predictions['y'] = df['y'].values
predictions.set_index('ds')[['y', 'yhat', 'yhat_lower', 'yhat_upper']].iplot()
fbp_model = predictions.set_index('ds')[['y', 'yhat', 'yhat_lower', 'yhat_upper']]
fbp_model['anomaly_factor'] = ((fbp_model['yhat'] - fbp_model['y'])/100)
layout_fbp = {'shapes': []}
anomalies = []
anomaly_start = None
trigger_factor = 0.4
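# Collect (start, end) intervals where the model over-predicts traffic by more
# than the trigger factor, i.e. where the observed traffic dropped sharply.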
for dt, row in fbp_model.iterrows():
if row.anomaly_factor and row.anomaly_factor > trigger_factor:
if anomaly_start is None:
anomaly_start = dt
elif anomaly_start is not None:
anomalies.append((anomaly_start, dt))
anomaly_start = None
for start, end in anomalies:
layout_fbp['shapes'].append({
'type': 'rect',
'xref': 'x',
'yref': 'paper',
'x0': start,
'y0': 0,
'x1': end,
'y1': 1,
'fillcolor': 'red',
'opacity': 0.5,
'line': {
'width': 0
}
})
fbp_model.iplot(layout=layout_fbp)
We build a "Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors" (SARIMAX) model and compare the actual values against the model's predictions.
We set the seasonal period s in seasonal_order to 48, since the data has 48 ticks per day (one every 30 minutes).
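As a quick sanity check of the tick spacing (a sketch, using the df built above):
df['ds'].diff().median()                       # expected: Timedelta('0 days 00:30:00')
pd.Timedelta('1D') / df['ds'].diff().median()  # expected: 48.0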
arimam = sm.tsa.SARIMAX(df.set_index('ds')['y'], order=(1,0,0), seasonal_order=(1,1,0,48)).fit()
arima_model = df.set_index('ds')
start_time = (arima_model.index[0] + timedelta(days=1)).strftime('%Y-%m-%d')
end_time = arima_model.index[-1].strftime('%Y-%m-%d')
arima_model['ypred'] = arimam.predict(start_time, end_time, dynamic=True)
If we see an anomaly_factor greater than 0.4 (a positive value means the observed traffic dropped below the prediction), we color that region red, indicating a possible blackout event. For example, with ypred = 80 and y = 20 the anomaly_factor is (80 - 20)/100 = 0.6, which exceeds the trigger.
arima_model['anomaly_factor'] = ((arima_model['ypred'] - arima_model['y'])/100)
layout_arima = {'shapes': []}
anomalies = []
anomaly_start = None
trigger_factor = 0.4
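# Same anomaly-interval extraction as for the Prophet model above.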
for dt, row in arima_model.iterrows():
if row.anomaly_factor and row.anomaly_factor > trigger_factor:
if anomaly_start is None:
anomaly_start = dt
elif anomaly_start is not None:
anomalies.append((anomaly_start, dt))
anomaly_start = None
for start, end in anomalies:
layout_arima['shapes'].append({
'type': 'rect',
'xref': 'x',
'yref': 'paper',
'x0': start,
'y0': 0,
'x1': end,
'y1': 1,
'fillcolor': 'red',
'opacity': 0.5,
'line': {
'width': 0
}
})
Plot it!
arima_model.iplot(layout=layout_arima)