Research talk:Reading time/Work log/2018-10-09

Tuesday, October 9, 2018

Model Selection

I ran into problems taking stratified samples using Hive. The approach I was using relied on an `ORDER BY RAND()`, which is slow. Instead I'm now using the `sampleBy` function in Spark.
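For context, the Hive version looked roughly like the sketch below. This is a hypothetical reconstruction rather than the original query (the table name and the 20,000-row cap come from the code that follows); randomly ordering every row within each wiki before truncating forces a full sort, which is what made it slow.

# hypothetical sketch of the slow Hive-side stratified sample, not the original query
slow_query = """
SELECT *
FROM (SELECT *,
             ROW_NUMBER() OVER (PARTITION BY wiki ORDER BY RAND()) AS rn
      FROM nathante.cleanReadingDepth) t
WHERE rn <= 20000
"""
# hive_cursor.execute(slow_query)  # uses the same Hive cursor as the query later in this log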

 
max_size = 20000
min_size = 300

# count events per wiki to work out per-wiki sampling fractions
dt = spark.read.table("nathante.cleanReadingDepth")
n_by_wiki = dt.groupBy('wiki').count().toPandas()

ge_max_size = n_by_wiki['count'] >= max_size
le_min_size = n_by_wiki['count'] < min_size

# cap large wikis at roughly max_size rows, keep mid-sized wikis whole, drop tiny wikis
n_by_wiki.loc[ge_max_size, "samp_prop"] = max_size / n_by_wiki['count']
n_by_wiki.loc[~ge_max_size, "samp_prop"] = 1
n_by_wiki.loc[le_min_size, "samp_prop"] = 0

# sampleBy takes a {stratum: fraction} dict
props = n_by_wiki.set_index('wiki').to_dict(orient='dict')['samp_prop']
strat_samp = dt.sampleBy(col='wiki', fractions=props)
strat_samp.write.saveAsTable("nathante.samp_cleanReadingData", mode='overwrite', partitionBy=['year', 'month'])

Using the stratified sample, I fit each of the distributions and compute goodness-of-fit statistics.
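For reference, the statistics computed below are the standard information criteria and likelihood-ratio statistic, with k the number of free parameters, n the number of held-out observations, and \hat{L} the likelihood of the held-out split under parameters fit on the training split:

    \mathrm{AIC} = 2k - 2\ln\hat{L}, \qquad
    \mathrm{BIC} = k\ln n - 2\ln\hat{L}, \qquad
    \lambda_{ab} = 2(\ln\hat{L}_a - \ln\hat{L}_b) \sim \chi^2_{\Delta k}

Here \Delta k is the difference in parameter counts between models a and b; the code below uses at least one degree of freedom so that models with equal parameter counts can still be compared.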

# fit candidate distributions for each wiki and compare their goodness of fit
import itertools as it

import numpy as np
import pandas as pd
from scipy import stats

def select_model(dt_all):

    time_threshold = np.inf

    # candidate distributions and their number of free parameters (with loc fixed at 0)
    models = [stats.weibull_min, stats.exponweib, stats.lognorm, stats.gamma, stats.expon]
    dfs = {mod.name: df for mod, df in zip(models, [2, 3, 2, 2, 1])}
    res = {}
    for wiki in set(dt_all.wiki):

        # keep positive, non-missing dwell times for this wiki
        dtwiki = dt_all.loc[(dt_all.wiki == wiki) &
                            (~dt_all.visiblelength.isnull()) &
                            (dt_all.visiblelength > 0) &
                            (dt_all.visiblelength <= time_threshold), "visiblelength"]

        # random 75/25 train/test split
        trainassign = np.random.binomial(1, 0.75, len(dtwiki))
        dtwiki_train = dtwiki.iloc[trainassign == 1]
        dtwiki_test = dtwiki.iloc[trainassign == 0]

        # fit each distribution on the training split with the location fixed at 0
        fit_models = [m.fit(dtwiki_train, floc=0) for m in models]

        # Kolmogorov-Smirnov test of each fitted distribution on the held-out split
        ks_tests = [stats.kstest(dtwiki_test, mod.name, fit) for mod, fit in zip(models, fit_models)]
        
        # held-out log-likelihood, AIC, and BIC for each model
        logliks = []
        AICs = []
        BICs = []
        for mod, fit in zip(models, fit_models):
            loglik = mod.logpdf(dtwiki_test, *fit).sum()
            logliks.append(loglik)
            AICs.append(2 * dfs[mod.name] - 2 * loglik)
            BICs.append(np.log(len(dtwiki_test)) * dfs[mod.name] - 2 * loglik)

        logliks = np.array(logliks)
        # pairwise likelihood ratio tests: chi-square tests with df set to the
        # difference in parameter counts (at least 1)
        lrs = []
        lrtests = {}
        for pair in it.combinations(zip(models, logliks), 2):
            df = dfs[pair[0][0].name] - dfs[pair[1][0].name]
            if df < 0:
                # make a the model with the larger number of parameters
                df = -df
                a = pair[1]
                b = pair[0]
            else:
                a = pair[0]
                b = pair[1]
            k = "{0}:{1}".format(a[0].name, b[0].name)
            # if lr > 1 then a is more likely than b on the held-out data
            lr = np.exp(a[1] - b[1])
            lrs.append((k, lr))
            # p-value for the null hypothesis that the simpler model b fits as well as a
            pvalue = stats.chi2.sf(2 * (a[1] - b[1]), df=1 + df)
            lrtests[k] = pvalue
        res[wiki] = [pd.DataFrame({"model" : [mod.name for mod in models],
                                   "ks" : [ks.statistic for ks in ks_tests],
                                   "loglik" : logliks,
                                   "AIC" : AICs,
                                   "BIC" : BICs}),
                     lrtests]
    return res

# we need the stratified sample to be able to look at smaller wikis like pa and hi
hive_cursor.execute("SELECT event.totalLength AS totalLength, event.VisibleLength AS visibleLength, wiki FROM nathante.samp_cleanReadingData WHERE year > 0 LIMIT 10000000")

# pull the query result into a pandas data frame
dt = as_pandas(hive_cursor)

# convert times from milliseconds to seconds
dt.visiblelength = dt.visiblelength / 1000
dt.totallength = dt.totallength / 1000

res = select_model(dt)

Now we are ready to see how well the models fit for each wiki. To compare them, I rank the models by AIC and BIC within each wiki and then look at the average rank across wikis.
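For a single wiki, the fit table and likelihood-ratio p-values returned by select_model can be inspected directly. A minimal sketch ('enwiki' is only an illustrative key, assumed to be present in the sample):

# look at one wiki before aggregating across wikis ('enwiki' is an example key)
per_wiki_table, lr_pvalues = res['enwiki']
print(per_wiki_table.sort_values('AIC'))
print(lr_pvalues)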

# make it pretty: one table of fit statistics across all wikis
for r in res.keys():
    res[r][0]['wiki'] = r

table = pd.concat([res[r][0] for r in res.keys()]).reset_index()
table = table.drop(columns='index')
table = table[['wiki', 'model', 'AIC', 'BIC', 'ks', 'loglik']].reset_index()

# rank the models within each wiki by AIC and BIC (rank 1 = best)
gb = table.groupby(['wiki'])[['AIC', 'BIC']].rank().reset_index()
gb['AIC_rank'] = gb['AIC']
gb['BIC_rank'] = gb['BIC']
gb = gb.drop(columns=['AIC', 'BIC'])
table = table.merge(gb, on='index')

# mean and median rank of each model across wikis
table.groupby(['model'])[['AIC_rank', 'BIC_rank']].agg(['mean', 'median'])

Results

                AIC_rank           BIC_rank
                    mean  median       mean  median
model
expon           4.950413     5.0   4.962810     5.0
exponweib       1.483471     1.0   1.396694     1.0
gamma           4.008264     4.0   4.004132     4.0
lognorm         1.566116     2.0   1.644628     2.0
weibull_min     2.991736     3.0   2.991736     3.0

So we can see quite clearly that the exponentiated Weibull model is the best model for the data according to both AIC and BIC. The lognormal is the next best on average, and is frequently the best model for an individual wiki. The Weibull model is sometimes, but rarely, ranked better than third.