Do Small Training Yards Punch Above Their Weight ? : R Code

####################################################################
#
# R Code to accompany the blog post ‘Do small Training Yards Punch Above Their Weight?

#

# SmallTrainerAnalysis.R
#
# J. Hathorn
#
# v1.0
#
#
# Code to look at whether small yards punch above their weight
#
# written 10-Sep-13
#
#
###################################################################

#rm(list=ls())

library(foreign)
library(maptools)

# read in database files from RI
#
setwd(“C:/Program Files (x86)/RaceForm Interactive”)

RIhorse.data <-read.dbf(“horse.dbf”)
RIouting.data <-read.dbf(“outing.dbf”)
RIrace.data <-read.dbf(“race.dbf”)
RIsire.data <-read.dbf(“sire.dbf”)
RItrainer.data <- read.dbf(“trainer.dbf”)
RIcourse.data<-read.dbf(“course.dbf”)

# #############################################################
# set date parameters to focus on races between chosen dates
# flat season Lincoln to the November Handicap
chosenDateSt<-c(“2012-03-31”)
chosenDateEd<-c(“2012-11-10”)

# set dates for determining yard sizes, set the previous year to the November Handicap
chosenDateSt1<-c(“2011-11-11”)
chosenDateEd1<-c(“2012-11-10”)
#################################################
#
# extract GB course id list from course db
z<-which(RIcourse.data$CCOUNTRY == “GB”)

GBcourseids<-RIcourse.data$CID[z]
GBcoursenames<-RIcourse.data$CNAME[z]
#
# extract GB/IRE trainer lists from trainer db
z<-which(RItrainer.data$TCOUNTRY == “GB”)
GBtrainers<-RItrainer.data$TID[z]
z<-which(RItrainer.data$TCOUNTRY == “IRE”)
IREtrainers<-RItrainer.data$TID[z]
GBIREtrainers<-append(GBtrainers,IREtrainers)
#
#
##################################################

# select outings on the flat between the chosen dates to categorise trainers
tmpidx<-which(RIouting.data$ODATE>=chosenDateSt1 & RIouting.data$ODATE<=chosenDateEd1)
T1.data<-RIouting.data[tmpidx,]

# match the course and add a country variable
z<-match(T1.data$OCOURSEID,RIcourse.data$CID)
T1.data$COCOUNTRY<-NA
T1.data$COCOUNTRY<-RIcourse.data$CCOUNTRY[z]

T1a.data<-T1.data
# select outings that took place on GB and IRE courses and append the 2 d f
tmpidx<-which(T1.data$COCOUNTRY == “GB”)
T1.data<-T1.data[tmpidx,]

tmpidx<-which(T1a.data$COCOUNTRY == “IRE”)
T2.data<-T1a.data[tmpidx,]

T1.data<-rbind(T1.data,T2.data)
# put the horse and trainer IDs into unique variables and add trainer name and country and horse name
horse<-T1.data$OHORSEID
trainer<-T1.data$OTRAINERID
uhorse<-unique(horse)
z<-match(uhorse,horse)
utrainer<-trainer[z]

# match the trainer name/domicile set up domestic/foreign variable
z<-match(utrainer,RItrainer.data$TID)
utrainerhome<-RItrainer.data$TCOUNTRY[z]
utrainername<-RItrainer.data$TSTYLENAME[z]
domestic<-0
z<-which(utrainerhome==”GB”)
domestic[z]<-1
z<-which(utrainerhome==”IRE”)
domestic[z]<-1
z<-which(is.na(domestic))
domestic[z]<-0

# match the horse name and calculate the horse age
z<-match(uhorse,RIhorse.data$HID)
uhorsename<-RIhorse.data$HNAME[z]
uhorsefdate<-RIhorse.data$HFOALDATE[z]
uhorseage<-(as.Date(chosenDateEd)-uhorsefdate)/365
# aggregate to num horses by trainer
trainerid<-tapply(utrainer,utrainer,mean)
numhorses<-tapply(uhorse,utrainer,length)
z<-match(trainerid,RItrainer.data$TID)
trainerctry<-RItrainer.data$TCOUNTRY[z]
trainerdomestic<-0
z<-which(trainerctry==”GB”)
trainerdomestic[z]<-1
z<-which(trainerctry==”IRE”)
trainerdomestic[z]<-1
z<-which(is.na(trainerdomestic))
trainerdomestic[z]<-0
# allocate trainers to overseas/tiny/small/medium/large categories in the variable size
tiny<-5
sml<-25
med<-75
size<-NA

t1<-which(trainerdomestic==0)
size[t1]<-“OVS”

t1<-which(trainerdomestic==1 & numhorses<= tiny)
size[t1]<-“TINY”
t1<-which(trainerdomestic==1 & numhorses>tiny & numhorses <=sml)
size[t1]<-“SMALL”
t1<-which(trainerdomestic==1 & numhorses>sml & numhorses <=med)
size[t1]<-“MEDIUM”
t1<-which(trainerdomestic==1 & numhorses>med)
size[t1]<-“LARGE”

size<-as.factor(size)
# total/avg horses by size category
horsesbycat<-tapply(numhorses,size,sum)
avgbycat<-tapply(numhorses,size,mean,na.rm=TRUE)

# match the size by trainer back to the unique horse vector, will be useful later
#
z<-match(utrainer,trainerid)
utrainersize<-size[z]

###############################################
#
# go to the race file and calculate how many GB handicap and pattern races and runners in the period examined
#
tmpidx<-which(RIrace.data$RDATE>=chosenDateSt & RIrace.data$RDATE<=chosenDateEd & RIrace.data$RFJ==”F”)
R1.data<-RIrace.data[tmpidx,]

z<-match(R1.data$RCOURSEID,RIcourse.data$CID)
R1.data$COCOUNTRY<-NA
R1.data$COCOUNTRY<-RIcourse.data$CCOUNTRY[z]

# select outings that took place on GB courses
tmpidx<-which(R1.data$COCOUNTRY == “GB”)
R1.data<-R1.data[tmpidx,]

# match the winning horse into uhorse to find trainer id/name, trainer home and size category
z<-match(R1.data$RWINHRSID,uhorse)
R1.data$trainerid<-utrainer[z]
R1.data$trainername<-utrainername[z]
R1.data$trainersize<-utrainersize[z]
R1.data$trainerhome<-utrainerhome[z]
R1.data$horseage<-uhorseage[z]

# set up 1/0 values for aggregation later by trainer category
z<-which(R1.data$trainersize==”OVS”)
R1.data$ovs<-NA
R1.data$ovs[z]<-1
z<-which(is.na(R1.data$ovs))
R1.data$ovs[z]<-0

z<-which(R1.data$trainersize==”TINY”)
R1.data$tiny<-NA
R1.data$tiny[z]<-1
z<-which(is.na(R1.data$tiny))
R1.data$tiny[z]<-0

z<-which(R1.data$trainersize==”SMALL”)
R1.data$sml<-NA
R1.data$sml[z]<-1
z<-which(is.na(R1.data$sml))
R1.data$sml[z]<-0

z<-which(R1.data$trainersize==”MEDIUM”)
R1.data$med<-NA
R1.data$med[z]<-1
z<-which(is.na(R1.data$med))
R1.data$med[z]<-0

z<-which(R1.data$trainersize==”LARGE”)
R1.data$lge<-NA
R1.data$lge[z]<-1
z<-which(is.na(R1.data$lge))
R1.data$lge[z]<-0
# match the winning horse into the RIhorse d f to get the sire id
z<-match(R1.data$RWINHRSID,RIhorse.data$HID)
R1.data$WINSIREID<-RIhorse.data$HSIREID[z]

# produce pattern only and handicap only d f
#
z<-which(R1.data$RPATTERN != “NOT” & R1.data$RISHCAP==”FALSE”)
Patterns<-R1.data[z,]

z<-which(R1.data$RISHCAP== “TRUE”)
Hcaps<-R1.data[z,]

# summarise winners for patterns/hcaps by sire ID in total and by trainer category
#
PatternWinsSireIDTmp<-tapply(Patterns$WINSIREID,Patterns$WINSIREID,mean)
PatternWinsSireTmp<-tapply(Patterns$WINSIREID,Patterns$WINSIREID,length)
PatternWinsSireOvsTmp<-tapply(Patterns$ovs,Patterns$WINSIREID,sum)
PatternWinsSireTinyTmp<-tapply(Patterns$tiny,Patterns$WINSIREID,sum)
PatternWinsSireSmlTmp<-tapply(Patterns$sml,Patterns$WINSIREID,sum)
PatternWinsSireMedTmp<-tapply(Patterns$med,Patterns$WINSIREID,sum)
PatternWinsSireLgeTmp<-tapply(Patterns$lge,Patterns$WINSIREID,sum)

HcapWinsSireIDTmp<-tapply(Hcaps$WINSIREID,Hcaps$WINSIREID,mean)
HcapWinsSireTmp<-tapply(Hcaps$WINSIREID,Hcaps$WINSIREID,length)
HcapWinsSireOvsTmp<-tapply(Hcaps$ovs,Hcaps$WINSIREID,sum)
HcapWinsSireTinyTmp<-tapply(Hcaps$tiny,Hcaps$WINSIREID,sum)
HcapWinsSireSmlTmp<-tapply(Hcaps$sml,Hcaps$WINSIREID,sum)
HcapWinsSireMedTmp<-tapply(Hcaps$med,Hcaps$WINSIREID,sum)
HcapWinsSireLgeTmp<-tapply(Hcaps$lge,Hcaps$WINSIREID,sum)

# summarise winners for patterns/hcaps by trainer size
PatternWinsTrainers<-tapply(Patterns$trainersize,Patterns$trainersize,length)
HcapWinsTrainers<-tapply(Hcaps$trainersize,Hcaps$trainersize,length)

#########################
#
# find out number of runs by category for each race type
#
z<-match(T1.data$OTRAINERID,utrainer)
T1.data$trainername<-utrainername[z]
T1.data$trainersize<-utrainersize[z]
T1.data$trainerhome<-utrainerhome[z]
T1.data$horseage<-uhorseage[z]

# set up 1/0 values for aggregation later by trainer category
z<-which(T1.data$trainersize==”OVS”)
T1.data$ovs<-NA
T1.data$ovs[z]<-1
z<-which(is.na(T1.data$ovs))
T1.data$ovs[z]<-0

z<-which(T1.data$trainersize==”TINY”)
T1.data$tiny<-NA
T1.data$tiny[z]<-1
z<-which(is.na(T1.data$tiny))
T1.data$tiny[z]<-0

z<-which(T1.data$trainersize==”SMALL”)
T1.data$sml<-NA
T1.data$sml[z]<-1
z<-which(is.na(T1.data$sml))
T1.data$sml[z]<-0

z<-which(T1.data$trainersize==”MEDIUM”)
T1.data$med<-NA
T1.data$med[z]<-1
z<-which(is.na(T1.data$med))
T1.data$med[z]<-0

z<-which(T1.data$trainersize==”LARGE”)
T1.data$lge<-NA
T1.data$lge[z]<-1
z<-which(is.na(T1.data$lge))
T1.data$lge[z]<-0

# bring in race types

z<-match(T1.data$ORACEID,R1.data$RID)
T1.data$RPATTERN<-R1.data$RPATTERN[z]
T1.data$RISHCAP<-R1.data$RISHCAP[z]

# match each horse into the RIhorse d f to get the sire id
z<-match(T1.data$OHORSEID,RIhorse.data$HID)
T1.data$SIREID<-RIhorse.data$HSIREID[z]

# produce pattern only and handicap only outing d f
#
z<-which(T1.data$RPATTERN != “NOT” & T1.data$RISHCAP==”FALSE”)
OutPatterns<-T1.data[z,]

z<-which(T1.data$RISHCAP== “TRUE”)
OutHcaps<-T1.data[z,]

# summarise runners for patterns/hcaps by trainer size
PatternRunsTrainers<-tapply(OutPatterns$trainersize,OutPatterns$trainersize,length)
HcapRunsTrainers<-tapply(OutHcaps$trainersize,OutHcaps$trainersize,length)

# summarise runners for patterns/hcaps by sire ID
PatternRunsSireID<-tapply(OutPatterns$SIREID,OutPatterns$SIREID,mean)
PatternRunsSire<-tapply(OutPatterns$SIREID,OutPatterns$SIREID,length)
PatternRunsSireOvs<-tapply(OutPatterns$ovs,OutPatterns$SIREID,sum)
PatternRunsSireTiny<-tapply(OutPatterns$tiny,OutPatterns$SIREID,sum)
PatternRunsSireSml<-tapply(OutPatterns$sml,OutPatterns$SIREID,sum)
PatternRunsSireMed<-tapply(OutPatterns$med,OutPatterns$SIREID,sum)
PatternRunsSireLge<-tapply(OutPatterns$lge,OutPatterns$SIREID,sum)

HcapRunsSireID<-tapply(OutHcaps$SIREID,OutHcaps$SIREID,mean)
HcapRunsSire<-tapply(OutHcaps$SIREID,OutHcaps$SIREID,length)
HcapRunsSireOvs<-tapply(OutHcaps$ovs,OutHcaps$SIREID,sum)
HcapRunsSireTiny<-tapply(OutHcaps$tiny,OutHcaps$SIREID,sum)
HcapRunsSireSml<-tapply(OutHcaps$sml,OutHcaps$SIREID,sum)
HcapRunsSireMed<-tapply(OutHcaps$med,OutHcaps$SIREID,sum)
HcapRunsSireLge<-tapply(OutHcaps$lge,OutHcaps$SIREID,sum)

# calc the average age of the horses run by each trainer category in handicaps
#
HcapAge.ovs<-sum(OutHcaps$ovs*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$ovs),na.rm=TRUE)
HcapAge.tiny<-sum(OutHcaps$tiny*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$tiny),na.rm=TRUE)
HcapAge.sml<-sum(OutHcaps$sml*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$sml),na.rm=TRUE)
HcapAge.med<-sum(OutHcaps$med*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$med),na.rm=TRUE)
HcapAge.lge<-sum(OutHcaps$lge*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$lge),na.rm=TRUE)
#
PtrnAge.ovs<-sum(OutPatterns$ovs*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$ovs),na.rm=TRUE)
PtrnAge.tiny<-sum(OutPatterns$tiny*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$tiny),na.rm=TRUE)
PtrnAge.sml<-sum(OutPatterns$sml*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$sml),na.rm=TRUE)
PtrnAge.med<-sum(OutPatterns$med*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$med),na.rm=TRUE)
PtrnAge.lge<-sum(OutPatterns$lge*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$lge),na.rm=TRUE)

 
# as the winners tapply won’t include all runners, match back into winner variables to line up winners with runners
z<-match(PatternRunsSireID,PatternWinsSireIDTmp)
PatternWinsSire<-PatternWinsSireTmp[z]
PatternWinsSireOvs<-PatternWinsSireOvsTmp[z]
PatternWinsSireTiny<-PatternWinsSireTinyTmp[z]
PatternWinsSireSml<-PatternWinsSireSmlTmp[z]
PatternWinsSireMed<-PatternWinsSireMedTmp[z]
PatternWinsSireLge<-PatternWinsSireLgeTmp[z]

z<-which(is.na(PatternWinsSire))
PatternWinsSire[z]<-0
z<-which(is.na(PatternWinsSireOvs))
PatternWinsSireOvs[z]<-0
z<-which(is.na(PatternWinsSireTiny))
PatternWinsSireTiny[z]<-0
z<-which(is.na(PatternWinsSireSml))
PatternWinsSireSml[z]<-0
z<-which(is.na(PatternWinsSireMed))
PatternWinsSireMed[z]<-0
z<-which(is.na(PatternWinsSireLge))
PatternWinsSireLge[z]<-0

# repeat for handicaps

z<-match(HcapRunsSireID,HcapWinsSireIDTmp)
HcapWinsSire<-HcapWinsSireTmp[z]
HcapWinsSireOvs<-HcapWinsSireOvsTmp[z]
HcapWinsSireTiny<-HcapWinsSireTinyTmp[z]
HcapWinsSireSml<-HcapWinsSireSmlTmp[z]
HcapWinsSireMed<-HcapWinsSireMedTmp[z]
HcapWinsSireLge<-HcapWinsSireLgeTmp[z]

z<-which(is.na(HcapWinsSire))
HcapWinsSire[z]<-0
z<-which(is.na(HcapWinsSireOvs))
HcapWinsSireOvs[z]<-0
z<-which(is.na(HcapWinsSireTiny))
HcapWinsSireTiny[z]<-0
z<-which(is.na(HcapWinsSireSml))
HcapWinsSireSml[z]<-0
z<-which(is.na(HcapWinsSireMed))
HcapWinsSireMed[z]<-0
z<-which(is.na(HcapWinsSireLge))
HcapWinsSireLge[z]<-0

# calc IVs per sire
PatternIVsire<-(PatternWinsSire/sum(PatternWinsSire))/(PatternRunsSire/sum(PatternRunsSire))
HcapIVsire<-(HcapWinsSire/sum(HcapWinsSire))/(HcapRunsSire/sum(HcapRunsSire))

# calc IV adjusted runners by sire
#
PatternRunsSire.IV<- PatternRunsSire*PatternIVsire
PatternRunsSireOvs.IV<- PatternRunsSireOvs*PatternIVsire
PatternRunsSireTiny.IV<- PatternRunsSireTiny*PatternIVsire
PatternRunsSireSml.IV<- PatternRunsSireSml*PatternIVsire
PatternRunsSireMed.IV<- PatternRunsSireMed*PatternIVsire
PatternRunsSireLge.IV<- PatternRunsSireLge*PatternIVsire

HcapRunsSire.IV<- HcapRunsSire*HcapIVsire
HcapRunsSireOvs.IV<- HcapRunsSireOvs*HcapIVsire
HcapRunsSireTiny.IV<- HcapRunsSireTiny*HcapIVsire
HcapRunsSireSml.IV<- HcapRunsSireSml*HcapIVsire
HcapRunsSireMed.IV<- HcapRunsSireMed*HcapIVsire
HcapRunsSireLge.IV<- HcapRunsSireLge*HcapIVsire

 

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: