Loto code¶

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
%matplotlib inline

from scipy.stats import chisquare
from scipy.stats import anderson

Monte Carlo computation of the chi2 test on a dice¶

def role_dice(n):
    out = np.random.randint(1,7,n)
    exp = float(n)/6 # we expect having each nmumber n/6 times
    obs = [(out==i).sum() for i in np.arange(1,7)] # true observations
    T = [(obs[i]-exp)**2/exp for i in np.arange(6)]
    T = np.sum(T)
    return T

n_exp = 10000
n_simul = 1000
T = []
for n in np.arange(n_exp):
    T.extend([role_dice(n_simul)])

print(np.percentile(T,95))

11.06

Let's play with the lottery data¶

data = pd.read_csv('/home/romain/Documents/Github/loto/data/nouveau_loto.csv',sep=';',index_col=False)
del data['Unnamed: 25'] # the file end each line with a semi-colon which causes an empty column
print(data.shape)

(1152, 25)

data.head()

Application of the chi2 test on the lottery data - Does some numbers get out more often ?¶

Classic numbers¶

boule = pd.concat([data['boule_1'],data['boule_2'],data['boule_3'],data['boule_4'],data['boule_5']])
freq = boule.value_counts()
plt.figure(figsize=(12,6))
plt.bar(list(freq.index),freq)

<Container object of 49 artists>

Is this odd ?

To check it, we will use the chi-2 test with 48 degree of freedom that the observation come from a multionomial law

chisquare(freq)

Power_divergenceResult(statistic=45.870486111111113, pvalue=0.56051711176708841)

The p-value is really high, so we do not reject H0 and thus there is nothing abnormal there

The 'numero chance'¶

freq_chance = data['numero_chance'].value_counts()
plt.figure(figsize=(12,6))
plt.bar(list(freq_chance.index),freq_chance)
plt.xticks(np.arange(1,11)+0.5,
                   ["numero chance "+str(i+1) for i in np.arange(10)],rotation='vertical')

([<matplotlib.axis.XTick at 0x7fb514553c88>,
  <matplotlib.axis.XTick at 0x7fb514c10898>,
  <matplotlib.axis.XTick at 0x7fb51453bba8>,
  <matplotlib.axis.XTick at 0x7fb514b28da0>,
  <matplotlib.axis.XTick at 0x7fb514b2e7f0>,
  <matplotlib.axis.XTick at 0x7fb514b31240>,
  <matplotlib.axis.XTick at 0x7fb514b31c50>,
  <matplotlib.axis.XTick at 0x7fb514b356a0>,
  <matplotlib.axis.XTick at 0x7fb514b390f0>,
  <matplotlib.axis.XTick at 0x7fb514b39b00>],
 <a list of 10 Text xticklabel objects>)

chisquare(freq_chance)

Power_divergenceResult(statistic=17.270833333333332, pvalue=0.044639864855720156)

This one is a little more litigous ... Most of the time we want a p-value greater than 0.05 ... It may be a good idea to choose 1 as a 'numero chance'

What about pairs ?¶

def search(x,i):
    a = re.search(r"(^|-)"+str(i)+"(-|\+)",x)
    if a:
        out = True
    else:
        out = False
    return out

pairs = np.zeros((49,49))
for i in np.arange(1,50):
    ind_i = list(map(lambda x: search(x,i),data['combinaison_gagnante_en_ordre_croissant']))
    subdata_i = data.ix[ind_i]
    for j in np.arange(i+1,50):
        ind_ij = list(map(lambda x: search(x,j),subdata_i['combinaison_gagnante_en_ordre_croissant']))
        pairs[i-1,j-1] = np.sum(ind_ij)
        pairs[j-1,i-1] = np.sum(ind_ij)

pairs = pd.DataFrame(pairs)
plt.style.use('ggplot')
plt.figure(figsize=(15,10))
plt.title('Pairs heatmap')
plt.pcolor(pairs,cmap=plt.cm.Reds)
width = .5
height = .5
plt.xticks(np.arange(pairs.shape[0])+width,
           ["boule "+str(i+1) for i in np.arange(pairs.shape[1])],rotation='vertical')
plt.yticks(np.arange(pairs.shape[0])+width,
           ["boule "+str(i+1) for i in np.arange(pairs.shape[1])])

([<matplotlib.axis.YTick at 0x7fb514afb358>,
  <matplotlib.axis.YTick at 0x7fb5145531d0>,
  <matplotlib.axis.YTick at 0x7fb5144eec88>,
  <matplotlib.axis.YTick at 0x7fb5144dc668>,
  <matplotlib.axis.YTick at 0x7fb514a17908>,
  <matplotlib.axis.YTick at 0x7fb514a09e80>,
  <matplotlib.axis.YTick at 0x7fb5149fc588>,
  <matplotlib.axis.YTick at 0x7fb5149f0550>,
  <matplotlib.axis.YTick at 0x7fb5149dfb38>,
  <matplotlib.axis.YTick at 0x7fb514a550b8>,
  <matplotlib.axis.YTick at 0x7fb514affeb8>,
  <matplotlib.axis.YTick at 0x7fb5144f9208>,
  <matplotlib.axis.YTick at 0x7fb5145017f0>,
  <matplotlib.axis.YTick at 0x7fb514506240>,
  <matplotlib.axis.YTick at 0x7fb514506c50>,
  <matplotlib.axis.YTick at 0x7fb5145096a0>,
  <matplotlib.axis.YTick at 0x7fb51450d0f0>,
  <matplotlib.axis.YTick at 0x7fb51450db00>,
  <matplotlib.axis.YTick at 0x7fb514512550>,
  <matplotlib.axis.YTick at 0x7fb514512f60>,
  <matplotlib.axis.YTick at 0x7fb5145159b0>,
  <matplotlib.axis.YTick at 0x7fb51449a400>,
  <matplotlib.axis.YTick at 0x7fb5144f1198>,
  <matplotlib.axis.YTick at 0x7fb5144e9080>,
  <matplotlib.axis.YTick at 0x7fb5144dc828>,
  <matplotlib.axis.YTick at 0x7fb514a1c160>,
  <matplotlib.axis.YTick at 0x7fb514a10dd8>,
  <matplotlib.axis.YTick at 0x7fb514a097f0>,
  <matplotlib.axis.YTick at 0x7fb514a00128>,
  <matplotlib.axis.YTick at 0x7fb5149f5f60>,
  <matplotlib.axis.YTick at 0x7fb5149eb860>,
  <matplotlib.axis.YTick at 0x7fb5149e4160>,
  <matplotlib.axis.YTick at 0x7fb514a57748>,
  <matplotlib.axis.YTick at 0x7fb514a517b8>,
  <matplotlib.axis.YTick at 0x7fb51449ab70>,
  <matplotlib.axis.YTick at 0x7fb51449d3c8>,
  <matplotlib.axis.YTick at 0x7fb51449dbe0>,
  <matplotlib.axis.YTick at 0x7fb5144a04a8>,
  <matplotlib.axis.YTick at 0x7fb5144a0eb8>,
  <matplotlib.axis.YTick at 0x7fb5144a3908>,
  <matplotlib.axis.YTick at 0x7fb5144a6358>,
  <matplotlib.axis.YTick at 0x7fb5144a6d68>,
  <matplotlib.axis.YTick at 0x7fb5144a87b8>,
  <matplotlib.axis.YTick at 0x7fb5144ad208>,
  <matplotlib.axis.YTick at 0x7fb5144adc18>,
  <matplotlib.axis.YTick at 0x7fb5144b1668>,
  <matplotlib.axis.YTick at 0x7fb5144b40b8>,
  <matplotlib.axis.YTick at 0x7fb5144b4ac8>,
  <matplotlib.axis.YTick at 0x7fb5144b8518>],
 <a list of 49 Text yticklabel objects>)

ind = np.triu_indices(49,1)
chisquare(np.array(pairs)[ind])

Power_divergenceResult(statistic=1224.4916666666668, pvalue=0.15371052190725559)

Again, the p-value is really high, so we do not reject H0 and thus there is nothing abnormal there. Too bad for us :)

On the winning probability¶

Over the years¶

data['date'] = list(map(lambda x: datetime.datetime.strptime(x,'%d/%m/%Y').date(),data['date_de_tirage']))

f, axarr = plt.subplots(3, 2,figsize=(15,15))
for i in np.arange(6):
    r = int(i/2)
    c = i%2
    axarr[r,c].plot(data['date'],data['nombre_de_gagnant_au_rang'+str(i+1)])
    axarr[r,c].set_title('number of winner with '+str(6-i)+' correct numbers')
plt.show()

There does not seem to be a winning trend over the years. Now is not better than before !

The day of the week ?¶

f, axarr = plt.subplots(3, 2,figsize=(15,15))
days = list(set(data['jour_de_tirage']))
for i in np.arange(6):
    r = int(i/2)
    c = i%2
    tab = [np.mean(data['nombre_de_gagnant_au_rang'+str(i+1)][data['jour_de_tirage']==days[k]]) 
           for k in np.arange(len(days))]
    axarr[r,c].scatter(np.arange(len(days)),tab)
    axarr[r,c].set_title('number of winner with '+str(6-i)+' correct numbers')
    axarr[r,c].set_xticklabels(['','Monday','','Wednesday','','Saturday',''])
plt.show()

Saturday seems to be a good day to play ! Or not ... because if there is more winner, you will win less. What would be more interesting is the proportion of winner, unfortunately we do not have the number of opponents. But with the randomness hypothesis tested so far, we can infer that the game is really random, so your winning probability won't change with the day of the week, but the money you will earn may be bigger on Mondays than on Saturdays !

days = list(set(data['jour_de_tirage']))
uplift = []
for i in np.arange(6):
    monday = np.mean(data['nombre_de_gagnant_au_rang'+str(i+1)][data['jour_de_tirage']==days[0]]) 
    restOfWeek = np.mean(data['nombre_de_gagnant_au_rang'+str(i+1)][data['jour_de_tirage']!=days[0]]) 
    uplift.extend([(restOfWeek-monday)/restOfWeek*100])
print(np.mean(uplift))

-55.9805515698

The month¶

data['month'] = list(map(lambda dt: dt.month,data['date']))
f, axarr = plt.subplots(3, 2,figsize=(15,15))
for i in np.arange(6):
    r = int(i/2)
    c = i%2
    tab = [np.mean(data['nombre_de_gagnant_au_rang'+str(i+1)][data['month']==k]) 
           for k in np.arange(1,13)]
    axarr[r,c].scatter(np.arange(1,13),tab)
    axarr[r,c].set_title('number of winner with '+str(6-i)+' correct numbers')
    axarr[r,c].set_xlim([0.5,12.5])
plt.show()

It seems that there is less opponents during summer. Just sayin' ...

uplift = []
for i in np.arange(6):
    june = np.mean(data['nombre_de_gagnant_au_rang'+str(i+1)][(data['month']==6)]) 
    restOfYear = np.mean(data['nombre_de_gagnant_au_rang'+str(i+1)][(data['month']!=6)]) 
    uplift.extend([(restOfYear-june)/restOfYear*100])
print(np.mean(uplift))

9.21154412795

	annee_numero_de_tirage	jour_de_tirage	date_de_tirage	date_de_forclusion	boule_1	boule_2	boule_3	boule_4	boule_5	numero_chance	...	nombre_de_gagnant_au_rang3	rapport_du_rang3	nombre_de_gagnant_au_rang4	rapport_du_rang4	nombre_de_gagnant_au_rang5	rapport_du_rang5	nombre_de_gagnant_au_rang6	rapport_du_rang6	numero_jokerplus	devise
0	2016019	SAMEDI	13/02/2016	14/04/2016	38	30	27	41	10	4	...	971	1130,9	43256	11	634259	5,3	910674	2	1 678 529	eur
1	2016018	MERCREDI	10/02/2016	11/04/2016	44	46	1	27	48	10	...	285	1813,1	16519	13,5	258725	6,1	338890	2	5 758 628	eur
2	2016017	LUNDI	08/02/2016	09/04/2016	30	1	28	39	11	6	...	345	1662,4	17037	9,9	244222	4,9	348441	2	3 908 634	eur
3	2016016	SAMEDI	06/02/2016	07/04/2016	44	25	2	41	28	9	...	520	1341	24482	12,3	379330	5,6	551760	2	3 328 398	eur
4	2016015	MERCREDI	03/02/2016	04/04/2016	5	44	24	28	25	3	...	565	1100,9	28453	9,4	400389	4,8	538287	2	9 178 211	eur