Measuring Training Yard Success: R code

####################################################################
#
#
# Measuring Training Yard Success: Impact Values from Maidens, Handicaps and Pattern Races
# J. Hathorn
#
# v1.0
#
#
# written 18-Sep-13
#
#
###################################################################

#rm(list=ls())

library(foreign)
library(maptools)

# read in database files from RI
#
setwd(“C:/Program Files (x86)/RaceForm Interactive”)

RIhorse.data <-read.dbf(“horse.dbf”)
RIouting.data <-read.dbf(“outing.dbf”)
RIrace.data <-read.dbf(“race.dbf”)
RIsire.data <-read.dbf(“sire.dbf”)
RItrainer.data <- read.dbf(“trainer.dbf”)
RIcourse.data<-read.dbf(“course.dbf”)

# #############################################################
# set date parameters to focus on races between chosen dates
# flat season Lincoln to the November Handicap
chosenDateSt<-c(“2012-03-31”)
chosenDateEd<-c(“2012-11-10”)

# set dates for determining yard sizes, set the previous year to the November Handicap
#chosenDateSt1<-c(“2011-11-11”)
#chosenDateEd1<-c(“2012-11-10”)
#################################################
#
# extract GB course id list from course db
z<-which(RIcourse.data$CCOUNTRY == “GB”)

GBcourseids<-RIcourse.data$CID[z]
GBcoursenames<-RIcourse.data$CNAME[z]
#
# extract GB/IRE trainer lists from trainer db
z<-which(RItrainer.data$TCOUNTRY == “GB”)
GBtrainers<-RItrainer.data$TID[z]
z<-which(RItrainer.data$TCOUNTRY == “IRE”)
IREtrainers<-RItrainer.data$TID[z]
GBIREtrainers<-append(GBtrainers,IREtrainers)
#
#
##################################################

# select outings on the flat between the chosen dates
tmpidx<-which(RIouting.data$ODATE>=chosenDateSt & RIouting.data$ODATE<=chosenDateEd & RIouting.data$OFJ==”F”)
T1.data<-RIouting.data[tmpidx,]

# match the course and add a country variable
z<-match(T1.data$OCOURSEID,RIcourse.data$CID)
T1.data$COCOUNTRY<-NA
T1.data$COCOUNTRY<-RIcourse.data$CCOUNTRY[z]

T1a.data<-T1.data
# select outings that took place on GB courses and append the 2 d f
tmpidx<-which(T1.data$COCOUNTRY == “GB”)
T1.data<-T1.data[tmpidx,]

# #######################
# GB races only
# code to include IRE races if so desired
#tmpidx<-which(T1a.data$COCOUNTRY == “IRE”)
#T2.data<-T1a.data[tmpidx,]
#T1.data<-rbind(T1.data,T2.data)

# ######################
# age restriction if required
# reduce T1 to horses of the desired age given by the parameter agecheck
#agecheck<-2
#tmpidx<-which(T1.data$OAGE==agecheck)
#T1.data<-T1.data[tmpidx,]

# match each horse into the RIhorse d f to get the sire id
z<-match(T1.data$OHORSEID,RIhorse.data$HID)
T1.data$SIREID<-RIhorse.data$HSIREID[z]

# attach some of the race conditions ie age, stakes, handicap etc from the RIrace d f
z<-match(T1.data$ORACEID,RIrace.data$RID)
T1.data$RCOND<-RIrace.data$RCOND[z]
T1.data$RAGE<-RIrace.data$RAGE[z]
T1.data$RANIMAL<-RIrace.data$RANIMAL[z]
T1.data$RPATTERN<-RIrace.data$RPATTERN[z]
T1.data$RISHCAP<-RIrace.data$RISHCAP[z]

# set a 1/0 variable for winners and a 1 variable for runners will help aggregation later
T1.data$runner<-1
T1.data$winner<-0
z<-which(T1.data$OPOS==1)
T1.data$winner[z]<-1

# SP turned into a probability
T1.data$SPprob<-1/(1+T1.data$OSPVAL)

# reformat the ORF rating variable
T1.data$ORF<-as.character(T1.data$ORF)
T1.data$ORF<-gsub(“\\?”,””,T1.data$ORF)
T1.data$ORF<-gsub(“\\+”,””,T1.data$ORF)
T1.data$ORF<-as.numeric(T1.data$ORF)
tmpidx<-which(T1.data$ORF==0)
T1.data$ORF[tmpidx]<-NA
# reformat the OJC rating variable
T1.data$OJC<-as.character(T1.data$OJC)
T1.data$OJC<-gsub(“\\?”,””,T1.data$OJC)
T1.data$OJC<-gsub(“\\+”,””,T1.data$OJC)
T1.data$OJC<-as.numeric(T1.data$OJC)
tmpidx<-which(T1.data$OJC==0)
T1.data$OJC[tmpidx]<-NA

####################################################
#
# produce summary variables by horse – runs/wins/ratings etc
#
hid<-tapply(T1.data$OHORSEID,T1.data$OHORSEID,mean,na.rm=TRUE)
hruns<-tapply(T1.data$runner,T1.data$OHORSEID,sum,na.rm=TRUE)
hwins<-tapply(T1.data$winner,T1.data$OHORSEID,sum,na.rm=TRUE)
hwinner<-pmin(1,hwins)
hrunner<-pmin(1,hruns)
hORmax<-tapply(T1.data$OJC,T1.data$OHORSEID,max,na.rm=TRUE)
hRFmax<-tapply(T1.data$ORF,T1.data$OHORSEID,max,na.rm=TRUE)
z<-which(hORmax==-Inf)
hORmax[z]<-NA
z<-which(hRFmax==-Inf)
hRFmax[z]<-NA

z<-match(hid,T1.data$OHORSEID)
htrainerid<-T1.data$OTRAINERID[z]
z<-match(hid,RIhorse.data$HID)
hname<-RIhorse.data$HNAME[z]
# put these into a d f
HSummary<-data.frame(hid,hname,htrainerid,hruns,hwins,hwinner,hrunner,hORmax,hRFmax)

# ###############
# produce population wide summary stats
univ.OR.med<-median(HSummary$hORmax,na.rm=TRUE)
univ.RF.med<-median(HSummary$hRFmax,na.rm=TRUE)
univ.RF.sd<-sd(HSummary$hRFmax,na.rm=TRUE)
univ.RF.1sdup<-univ.RF.med+univ.RF.sd
univ.winners<-sum(HSummary$hwinner)
univ.runners<-sum(HSummary$hrunner)
univ.winpct<-univ.winners/univ.runners

HSummary$hRF1sdup<-0
z<-which(HSummary$hRFmax>univ.RF.1sdup)
HSummary$hRF1sdup[z]<-1
univ.RF.1sduppct<-sum(HSummary$hRF1sdup)/univ.runners

# ######################################################
#
# now take the horse summary df and produce a trainer summary based upon the horse summary d f

trainer.h<-tapply(HSummary$htrainerid,HSummary$htrainerid,mean,rm=TRUE)
wins.h<-tapply(HSummary$hwins,HSummary$htrainerid,sum,na.rm=TRUE)
runs.h<-tapply(HSummary$hruns,HSummary$htrainerid,sum,na.rm=TRUE)
winspct.h<-wins.h/runs.h
winner.h<-tapply(HSummary$hwinner,HSummary$htrainerid,sum,na.rm=TRUE)
runner.h<-tapply(HSummary$hrunner,HSummary$htrainerid,sum,na.rm=TRUE)
winpct.h<-winner.h/runner.h
#ORmax.med.h<-tapply(HSummary$hORmax,HSummary$htrainerid,median,na.rm=TRUE)
RFmax.med.h<-tapply(HSummary$hRFmax,HSummary$htrainerid,median,na.rm=TRUE)
RFmax.sd.h<-tapply(HSummary$hRFmax,HSummary$htrainerid,sd,na.rm=TRUE)
RFmax.up1sd.h<-RFmax.med.h+RFmax.sd.h
RF.1sdup.h<-tapply(HSummary$hRF1sdup,HSummary$htrainerid,sum,na.rm=TRUE)
RF.1sduppct.h<-RF.1sdup.h/runner.h

TrainerHorses<-data.frame(trainer.h,wins.h,runs.h,winspct.h,winner.h,runner.h,winpct.h,RFmax.med.h,RFmax.sd.h,RFmax.up1sd.h,RF.1sdup.h,RF.1sduppct.h)
z<-match(TrainerHorses$trainer.h,RItrainer.data$TID)
TrainerHorses$tname.h<-RItrainer.data$TSTYLENAME[z]

# write out this d f to a CSV file
#fname<-“c:/Racing Research/Trainer Research/trainerhorses.csv”
#write.csv(TrainerHorses,file=fname)

# ####################################################
#
# now go back to the Outing d f and split into race categories to get IVs etc by trainer
#
# split the races into different data frames
# Maidens
# Handicaps
# Pattern

# ###################################################
#
# put maidens into their own d f
z<-which(T1.data$RANIMAL==”MDN”)
T2.data<-T1.data[z,]

# set up Sire IVs in maidens
sire.ID<-tapply(T2.data$SIREID,T2.data$SIREID,mean,na.rm=TRUE)
sire.wins <- tapply(T2.data$winner,T2.data$SIREID,sum,na.rm=TRUE)
total.wins<-sum(sire.wins)
sire.runs <- tapply(T2.data$runner,T2.data$SIREID,sum,na.rm=TRUE)
total.runs<-sum(sire.runs)
sire.IV<-(sire.wins/total.wins)/(sire.runs/total.runs)

# bring sire IV back into the T2 d f and calc a sire adjusted run variable
z<-match(T2.data$SIREID,sire.ID)
T2.data$sire.IV<-sire.IV[z]
T2.data$runner.SA<-T2.data$runner*T2.data$sire.IV

# calc the trainer IVs in maidens
trainerID<-tapply(T2.data$OTRAINERID,T2.data$OTRAINERID,mean,na.rm=TRUE)
trainer.wins <- tapply(T2.data$winner,T2.data$OTRAINERID,sum,na.rm=TRUE)
total.wins<-sum(trainer.wins)
trainer.runs <- tapply(T2.data$runner,T2.data$OTRAINERID,sum,na.rm=TRUE)
trainer.runs.SA <- tapply(T2.data$runner.SA,T2.data$OTRAINERID,sum,na.rm=TRUE)
total.runs<-sum(trainer.runs)
total.runs.SA<-sum(trainer.runs.SA)
trainer.IV<-(trainer.wins/total.wins)/(trainer.runs/total.runs)
trainer.IV.SA<-(trainer.wins/total.wins)/(trainer.runs.SA/total.runs.SA)

#copy over to maiden specific variables
wins.mdns<-trainer.wins
runs.mdns<-trainer.runs
runs.mdns.SA<-trainer.runs.SA
IV.mdns<-trainer.IV
IV.SA.mdns<-trainer.IV.SA

# put into a maiden trainer summary d f
TrainerMdns<-data.frame(trainerID,wins.mdns,runs.mdns,runs.mdns.SA,IV.mdns,IV.SA.mdns)

# #####################################################
#
# put handicaps into their own d f
z<-which(T1.data$RISHCAP==”TRUE”)
T2.data<-T1.data[z,]

# set up Sire IVs in maidens
sire.ID<-tapply(T2.data$SIREID,T2.data$SIREID,mean,na.rm=TRUE)
sire.wins <- tapply(T2.data$winner,T2.data$SIREID,sum,na.rm=TRUE)
total.wins<-sum(sire.wins)
sire.runs <- tapply(T2.data$runner,T2.data$SIREID,sum,na.rm=TRUE)
total.runs<-sum(sire.runs)
sire.IV<-(sire.wins/total.wins)/(sire.runs/total.runs)

# bring sire IV back into the T2 d f and calc a sire adjusted run variable
z<-match(T2.data$SIREID,sire.ID)
T2.data$sire.IV<-sire.IV[z]
T2.data$runner.SA<-T2.data$runner*T2.data$sire.IV

# calc the trainer IVs in handicaps
trainerID<-tapply(T2.data$OTRAINERID,T2.data$OTRAINERID,mean,na.rm=TRUE)
trainer.wins <- tapply(T2.data$winner,T2.data$OTRAINERID,sum,na.rm=TRUE)
total.wins<-sum(trainer.wins)
trainer.runs <- tapply(T2.data$runner,T2.data$OTRAINERID,sum,na.rm=TRUE)
trainer.runs.SA <- tapply(T2.data$runner.SA,T2.data$OTRAINERID,sum,na.rm=TRUE)
total.runs<-sum(trainer.runs)
total.runs.SA<-sum(trainer.runs.SA)
trainer.IV<-(trainer.wins/total.wins)/(trainer.runs/total.runs)
trainer.IV.SA<-(trainer.wins/total.wins)/(trainer.runs.SA/total.runs.SA)

#copy over to handicap specific variables
wins.hcaps<-trainer.wins
runs.hcaps<-trainer.runs
runs.hcaps.SA<-trainer.runs.SA
IV.hcaps<-trainer.IV
IV.SA.hcaps<-trainer.IV.SA

# put into a handicap trainer summary d f
TrainerHcaps<-data.frame(trainerID,wins.hcaps,runs.hcaps,runs.hcaps.SA,IV.hcaps,IV.SA.hcaps)

# ###################################################
#
# put patterns into their own d f
z<-which(T1.data$RPATTERN !=”NOT” & T1.data$RISHCAP == “FALSE”)
T2.data<-T1.data[z,]

# set up Sire IVs in patterns
sire.ID<-tapply(T2.data$SIREID,T2.data$SIREID,mean,na.rm=TRUE)
sire.wins <- tapply(T2.data$winner,T2.data$SIREID,sum,na.rm=TRUE)
total.wins<-sum(sire.wins)
sire.runs <- tapply(T2.data$runner,T2.data$SIREID,sum,na.rm=TRUE)
total.runs<-sum(sire.runs)
sire.IV<-(sire.wins/total.wins)/(sire.runs/total.runs)

# bring sire IV back into the T2 d f and calc a sire adjusted run variable
z<-match(T2.data$SIREID,sire.ID)
T2.data$sire.IV<-sire.IV[z]
T2.data$runner.SA<-T2.data$runner*T2.data$sire.IV

# calc the trainer IVs in patterns
trainerID<-tapply(T2.data$OTRAINERID,T2.data$OTRAINERID,mean,na.rm=TRUE)
trainer.wins <- tapply(T2.data$winner,T2.data$OTRAINERID,sum,na.rm=TRUE)
total.wins<-sum(trainer.wins)
trainer.runs <- tapply(T2.data$runner,T2.data$OTRAINERID,sum,na.rm=TRUE)
trainer.runs.SA <- tapply(T2.data$runner.SA,T2.data$OTRAINERID,sum,na.rm=TRUE)
total.runs<-sum(trainer.runs)
total.runs.SA<-sum(trainer.runs.SA)
trainer.IV<-(trainer.wins/total.wins)/(trainer.runs/total.runs)
trainer.IV.SA<-(trainer.wins/total.wins)/(trainer.runs.SA/total.runs.SA)

#copy over to pattern specific variables
wins.ptns<-trainer.wins
runs.ptns<-trainer.runs
runs.ptns.SA<-trainer.runs.SA
IV.ptns<-trainer.IV
IV.SA.ptns<-trainer.IV.SA
# put into a pattern trainer summary d f
TrainerPtns<-data.frame(trainerID,wins.ptns,runs.ptns,runs.ptns.SA,IV.ptns,IV.SA.ptns)

# ###################################################
#
# merge the trainer summary d f s
#
Temp<-merge(TrainerMdns,TrainerHcaps,by.x=”trainerID”,by.y=”trainerID”,all.x=”TRUE”,all.y=”TRUE”)
Trainers<-merge(Temp,TrainerPtns,by.x=”trainerID”,by.y=”trainerID”,all.x=”TRUE”,all.y=”TRUE”)

z<-match(Trainers$trainerID,RItrainer.data$TID)
Trainers$tname<-RItrainer.data$TSTYLENAME[z]
Trainers$country<-RItrainer.data$TCOUNTRY[z]

# merge in the ratings d f
#
TrainersAll<-merge(Trainers,TrainerHorses,by.x=”trainerID”,by.y=”trainer.h”,all.x=TRUE,all.y=TRUE)
#
#
# clean up soome of the variables, replace NA by 0
#
z<-which(is.na(TrainersAll$runs.mdns))
TrainersAll$runs.mdns[z]<-0
z<-which(is.na(TrainersAll$runs.hcaps))
TrainersAll$runs.hcaps[z]<-0
z<-which(is.na(TrainersAll$runs.ptns))
TrainersAll$runs.ptns[z]<-0
z<-which(is.na(TrainersAll$runs.mdns.SA))
TrainersAll$runs.mdns.SA[z]<-0
z<-which(is.na(TrainersAll$runs.hcaps.SA))
TrainersAll$runs.hcaps.SA[z]<-0
z<-which(is.na(TrainersAll$runs.ptns.SA))
TrainersAll$runs.ptns.SA[z]<-0
z<-which(is.na(TrainersAll$wins.mdns))
TrainersAll$wins.mdns[z]<-0
z<-which(is.na(TrainersAll$wins.hcaps))
TrainersAll$wins.hcaps[z]<-0
z<-which(is.na(TrainersAll$wins.ptns))
TrainersAll$wins.ptns[z]<-0
z<-which(is.na(TrainersAll$IV.mdns))
TrainersAll$IV.mdns[z]<-0
z<-which(is.na(TrainersAll$IV.SA.mdns))
TrainersAll$IV.SA.mdns[z]<-0
z<-which(is.na(TrainersAll$IV.hcaps))
TrainersAll$IV.hcaps[z]<-0
z<-which(is.na(TrainersAll$IV.SA.hcaps))
TrainersAll$IV.SA.hcaps[z]<-0
z<-which(is.na(TrainersAll$IV.ptns))
TrainersAll$IV.ptns[z]<-0
z<-which(is.na(TrainersAll$IV.SA.ptns))
TrainersAll$IV.SA.ptns[z]<-0

TrainersAll$wins.all<-TrainersAll$wins.mdns+TrainersAll$wins.hcaps+TrainersAll$wins.ptns
TrainersAll$runs.all<-TrainersAll$runs.mdns+TrainersAll$runs.hcaps+TrainersAll$runs.ptns
TrainersAll$runs.all.SA<-TrainersAll$runs.mdns.SA+TrainersAll$runs.hcaps.SA+TrainersAll$runs.ptns.SA

# produce summary stats
#
# composite IVs weighted by all runs in maidens, handicaps and pattern races

TrainersAll$IVcomp1<-(TrainersAll$IV.mdns*sum(TrainersAll$runs.mdns,na.rm=TRUE)+TrainersAll$IV.hcaps*sum(TrainersAll$runs.hcaps,na.rm=TRUE)
+TrainersAll$IV.ptns*sum(TrainersAll$runs.ptns,na.rm=TRUE))/(sum(TrainersAll$runs.mdns,na.rm=TRUE)+sum(TrainersAll$runs.hcaps,na.rm=TRUE)+sum(TrainersAll$runs.ptns,na.rm=TRUE))

TrainersAll$IVcomp1.SA<-(TrainersAll$IV.SA.mdns*sum(TrainersAll$runs.mdns.SA,na.rm=TRUE)+TrainersAll$IV.SA.hcaps*sum(TrainersAll$runs.hcaps.SA,na.rm=TRUE)
+TrainersAll$IV.SA.ptns*sum(TrainersAll$runs.ptns.SA,na.rm=TRUE))/(sum(TrainersAll$runs.mdns.SA,na.rm=TRUE)+sum(TrainersAll$runs.hcaps.SA,na.rm=TRUE)+sum(TrainersAll$runs.ptns.SA,na.rm=TRUE))

TrainersAll$IVcomp2<-(TrainersAll$IV.mdns*TrainersAll$runs.mdns+TrainersAll$IV.hcaps*TrainersAll$runs.hcaps
+TrainersAll$IV.ptns*TrainersAll$runs.ptns)/(TrainersAll$runs.mdns+TrainersAll$runs.hcaps+TrainersAll$runs.ptns)

TrainersAll$IVcomp2.SA<-(TrainersAll$IV.SA.mdns*TrainersAll$runs.mdns.SA+TrainersAll$IV.SA.hcaps*TrainersAll$runs.hcaps.SA
+TrainersAll$IV.SA.ptns*TrainersAll$runs.ptns.SA)/(TrainersAll$runs.mdns.SA+TrainersAll$runs.hcaps.SA+TrainersAll$runs.ptns.SA)

# difference variables, hcaps – mdns
TrainersAll$IVdiff.hcapsmdns<-TrainersAll$IV.hcaps-TrainersAll$IV.mdns
TrainersAll$IVdiff.hcapsmdns.SA<-TrainersAll$IV.SA.hcaps-TrainersAll$IV.SA.mdns

# quality differences using composites
TrainersAll$IVdiff.comp1.SAraw<-TrainersAll$IVcomp1.SA-TrainersAll$IVcomp1
TrainersAll$IVdiff.comp2.SAraw<-TrainersAll$IVcomp2.SA-TrainersAll$IVcomp2
# reduce the list to those trainers that have had >=50 runs in handicaps and are GB based and more than 2*50 runs in total
minruns<-50
z<-which(TrainersAll$runs.hcaps >= minruns & TrainersAll$country==”GB” & TrainersAll$runs.all >= 2*minruns)
Temp<-TrainersAll[z,]
TrainersAll50GB<-Temp[order(-Temp$IVcomp2.SA),]

# write out this d f to a CSV file
fname<-“c:/Racing Research/Trainer Research/trainersall50gb.csv”
write.csv(TrainersAll50GB,file=fname)

slcutoff<-40
z<-which(TrainersAll50GB$runner.h < slcutoff)
TrainersSmall50GB<-TrainersAll50GB[z,]
fname<-“c:/Racing Research/Trainer Research/trainerssmall50gb.csv”
write.csv(TrainersSmall50GB,file=fname)
z<-which(TrainersAll50GB$runner.h >= slcutoff)
TrainersLarge50GB<-TrainersAll50GB[z,]
fname<-“c:/Racing Research/Trainer Research/trainerslarge50gb.csv”
write.csv(TrainersLarge50GB,file=fname)

Do Small Training Yards Punch Above Their Weight ? : R Code

####################################################################
#
# R Code to accompany the blog post ‘Do small Training Yards Punch Above Their Weight?

#

# SmallTrainerAnalysis.R
#
# J. Hathorn
#
# v1.0
#
#
# Code to look at whether small yards punch above their weight
#
# written 10-Sep-13
#
#
###################################################################

#rm(list=ls())

library(foreign)
library(maptools)

# read in database files from RI
#
setwd(“C:/Program Files (x86)/RaceForm Interactive”)

RIhorse.data <-read.dbf(“horse.dbf”)
RIouting.data <-read.dbf(“outing.dbf”)
RIrace.data <-read.dbf(“race.dbf”)
RIsire.data <-read.dbf(“sire.dbf”)
RItrainer.data <- read.dbf(“trainer.dbf”)
RIcourse.data<-read.dbf(“course.dbf”)

# #############################################################
# set date parameters to focus on races between chosen dates
# flat season Lincoln to the November Handicap
chosenDateSt<-c(“2012-03-31”)
chosenDateEd<-c(“2012-11-10”)

# set dates for determining yard sizes, set the previous year to the November Handicap
chosenDateSt1<-c(“2011-11-11”)
chosenDateEd1<-c(“2012-11-10”)
#################################################
#
# extract GB course id list from course db
z<-which(RIcourse.data$CCOUNTRY == “GB”)

GBcourseids<-RIcourse.data$CID[z]
GBcoursenames<-RIcourse.data$CNAME[z]
#
# extract GB/IRE trainer lists from trainer db
z<-which(RItrainer.data$TCOUNTRY == “GB”)
GBtrainers<-RItrainer.data$TID[z]
z<-which(RItrainer.data$TCOUNTRY == “IRE”)
IREtrainers<-RItrainer.data$TID[z]
GBIREtrainers<-append(GBtrainers,IREtrainers)
#
#
##################################################

# select outings on the flat between the chosen dates to categorise trainers
tmpidx<-which(RIouting.data$ODATE>=chosenDateSt1 & RIouting.data$ODATE<=chosenDateEd1)
T1.data<-RIouting.data[tmpidx,]

# match the course and add a country variable
z<-match(T1.data$OCOURSEID,RIcourse.data$CID)
T1.data$COCOUNTRY<-NA
T1.data$COCOUNTRY<-RIcourse.data$CCOUNTRY[z]

T1a.data<-T1.data
# select outings that took place on GB and IRE courses and append the 2 d f
tmpidx<-which(T1.data$COCOUNTRY == “GB”)
T1.data<-T1.data[tmpidx,]

tmpidx<-which(T1a.data$COCOUNTRY == “IRE”)
T2.data<-T1a.data[tmpidx,]

T1.data<-rbind(T1.data,T2.data)
# put the horse and trainer IDs into unique variables and add trainer name and country and horse name
horse<-T1.data$OHORSEID
trainer<-T1.data$OTRAINERID
uhorse<-unique(horse)
z<-match(uhorse,horse)
utrainer<-trainer[z]

# match the trainer name/domicile set up domestic/foreign variable
z<-match(utrainer,RItrainer.data$TID)
utrainerhome<-RItrainer.data$TCOUNTRY[z]
utrainername<-RItrainer.data$TSTYLENAME[z]
domestic<-0
z<-which(utrainerhome==”GB”)
domestic[z]<-1
z<-which(utrainerhome==”IRE”)
domestic[z]<-1
z<-which(is.na(domestic))
domestic[z]<-0

# match the horse name and calculate the horse age
z<-match(uhorse,RIhorse.data$HID)
uhorsename<-RIhorse.data$HNAME[z]
uhorsefdate<-RIhorse.data$HFOALDATE[z]
uhorseage<-(as.Date(chosenDateEd)-uhorsefdate)/365
# aggregate to num horses by trainer
trainerid<-tapply(utrainer,utrainer,mean)
numhorses<-tapply(uhorse,utrainer,length)
z<-match(trainerid,RItrainer.data$TID)
trainerctry<-RItrainer.data$TCOUNTRY[z]
trainerdomestic<-0
z<-which(trainerctry==”GB”)
trainerdomestic[z]<-1
z<-which(trainerctry==”IRE”)
trainerdomestic[z]<-1
z<-which(is.na(trainerdomestic))
trainerdomestic[z]<-0
# allocate trainers to overseas/tiny/small/medium/large categories in the variable size
tiny<-5
sml<-25
med<-75
size<-NA

t1<-which(trainerdomestic==0)
size[t1]<-“OVS”

t1<-which(trainerdomestic==1 & numhorses<= tiny)
size[t1]<-“TINY”
t1<-which(trainerdomestic==1 & numhorses>tiny & numhorses <=sml)
size[t1]<-“SMALL”
t1<-which(trainerdomestic==1 & numhorses>sml & numhorses <=med)
size[t1]<-“MEDIUM”
t1<-which(trainerdomestic==1 & numhorses>med)
size[t1]<-“LARGE”

size<-as.factor(size)
# total/avg horses by size category
horsesbycat<-tapply(numhorses,size,sum)
avgbycat<-tapply(numhorses,size,mean,na.rm=TRUE)

# match the size by trainer back to the unique horse vector, will be useful later
#
z<-match(utrainer,trainerid)
utrainersize<-size[z]

###############################################
#
# go to the race file and calculate how many GB handicap and pattern races and runners in the period examined
#
tmpidx<-which(RIrace.data$RDATE>=chosenDateSt & RIrace.data$RDATE<=chosenDateEd & RIrace.data$RFJ==”F”)
R1.data<-RIrace.data[tmpidx,]

z<-match(R1.data$RCOURSEID,RIcourse.data$CID)
R1.data$COCOUNTRY<-NA
R1.data$COCOUNTRY<-RIcourse.data$CCOUNTRY[z]

# select outings that took place on GB courses
tmpidx<-which(R1.data$COCOUNTRY == “GB”)
R1.data<-R1.data[tmpidx,]

# match the winning horse into uhorse to find trainer id/name, trainer home and size category
z<-match(R1.data$RWINHRSID,uhorse)
R1.data$trainerid<-utrainer[z]
R1.data$trainername<-utrainername[z]
R1.data$trainersize<-utrainersize[z]
R1.data$trainerhome<-utrainerhome[z]
R1.data$horseage<-uhorseage[z]

# set up 1/0 values for aggregation later by trainer category
z<-which(R1.data$trainersize==”OVS”)
R1.data$ovs<-NA
R1.data$ovs[z]<-1
z<-which(is.na(R1.data$ovs))
R1.data$ovs[z]<-0

z<-which(R1.data$trainersize==”TINY”)
R1.data$tiny<-NA
R1.data$tiny[z]<-1
z<-which(is.na(R1.data$tiny))
R1.data$tiny[z]<-0

z<-which(R1.data$trainersize==”SMALL”)
R1.data$sml<-NA
R1.data$sml[z]<-1
z<-which(is.na(R1.data$sml))
R1.data$sml[z]<-0

z<-which(R1.data$trainersize==”MEDIUM”)
R1.data$med<-NA
R1.data$med[z]<-1
z<-which(is.na(R1.data$med))
R1.data$med[z]<-0

z<-which(R1.data$trainersize==”LARGE”)
R1.data$lge<-NA
R1.data$lge[z]<-1
z<-which(is.na(R1.data$lge))
R1.data$lge[z]<-0
# match the winning horse into the RIhorse d f to get the sire id
z<-match(R1.data$RWINHRSID,RIhorse.data$HID)
R1.data$WINSIREID<-RIhorse.data$HSIREID[z]

# produce pattern only and handicap only d f
#
z<-which(R1.data$RPATTERN != “NOT” & R1.data$RISHCAP==”FALSE”)
Patterns<-R1.data[z,]

z<-which(R1.data$RISHCAP== “TRUE”)
Hcaps<-R1.data[z,]

# summarise winners for patterns/hcaps by sire ID in total and by trainer category
#
PatternWinsSireIDTmp<-tapply(Patterns$WINSIREID,Patterns$WINSIREID,mean)
PatternWinsSireTmp<-tapply(Patterns$WINSIREID,Patterns$WINSIREID,length)
PatternWinsSireOvsTmp<-tapply(Patterns$ovs,Patterns$WINSIREID,sum)
PatternWinsSireTinyTmp<-tapply(Patterns$tiny,Patterns$WINSIREID,sum)
PatternWinsSireSmlTmp<-tapply(Patterns$sml,Patterns$WINSIREID,sum)
PatternWinsSireMedTmp<-tapply(Patterns$med,Patterns$WINSIREID,sum)
PatternWinsSireLgeTmp<-tapply(Patterns$lge,Patterns$WINSIREID,sum)

HcapWinsSireIDTmp<-tapply(Hcaps$WINSIREID,Hcaps$WINSIREID,mean)
HcapWinsSireTmp<-tapply(Hcaps$WINSIREID,Hcaps$WINSIREID,length)
HcapWinsSireOvsTmp<-tapply(Hcaps$ovs,Hcaps$WINSIREID,sum)
HcapWinsSireTinyTmp<-tapply(Hcaps$tiny,Hcaps$WINSIREID,sum)
HcapWinsSireSmlTmp<-tapply(Hcaps$sml,Hcaps$WINSIREID,sum)
HcapWinsSireMedTmp<-tapply(Hcaps$med,Hcaps$WINSIREID,sum)
HcapWinsSireLgeTmp<-tapply(Hcaps$lge,Hcaps$WINSIREID,sum)

# summarise winners for patterns/hcaps by trainer size
PatternWinsTrainers<-tapply(Patterns$trainersize,Patterns$trainersize,length)
HcapWinsTrainers<-tapply(Hcaps$trainersize,Hcaps$trainersize,length)

#########################
#
# find out number of runs by category for each race type
#
z<-match(T1.data$OTRAINERID,utrainer)
T1.data$trainername<-utrainername[z]
T1.data$trainersize<-utrainersize[z]
T1.data$trainerhome<-utrainerhome[z]
T1.data$horseage<-uhorseage[z]

# set up 1/0 values for aggregation later by trainer category
z<-which(T1.data$trainersize==”OVS”)
T1.data$ovs<-NA
T1.data$ovs[z]<-1
z<-which(is.na(T1.data$ovs))
T1.data$ovs[z]<-0

z<-which(T1.data$trainersize==”TINY”)
T1.data$tiny<-NA
T1.data$tiny[z]<-1
z<-which(is.na(T1.data$tiny))
T1.data$tiny[z]<-0

z<-which(T1.data$trainersize==”SMALL”)
T1.data$sml<-NA
T1.data$sml[z]<-1
z<-which(is.na(T1.data$sml))
T1.data$sml[z]<-0

z<-which(T1.data$trainersize==”MEDIUM”)
T1.data$med<-NA
T1.data$med[z]<-1
z<-which(is.na(T1.data$med))
T1.data$med[z]<-0

z<-which(T1.data$trainersize==”LARGE”)
T1.data$lge<-NA
T1.data$lge[z]<-1
z<-which(is.na(T1.data$lge))
T1.data$lge[z]<-0

# bring in race types

z<-match(T1.data$ORACEID,R1.data$RID)
T1.data$RPATTERN<-R1.data$RPATTERN[z]
T1.data$RISHCAP<-R1.data$RISHCAP[z]

# match each horse into the RIhorse d f to get the sire id
z<-match(T1.data$OHORSEID,RIhorse.data$HID)
T1.data$SIREID<-RIhorse.data$HSIREID[z]

# produce pattern only and handicap only outing d f
#
z<-which(T1.data$RPATTERN != “NOT” & T1.data$RISHCAP==”FALSE”)
OutPatterns<-T1.data[z,]

z<-which(T1.data$RISHCAP== “TRUE”)
OutHcaps<-T1.data[z,]

# summarise runners for patterns/hcaps by trainer size
PatternRunsTrainers<-tapply(OutPatterns$trainersize,OutPatterns$trainersize,length)
HcapRunsTrainers<-tapply(OutHcaps$trainersize,OutHcaps$trainersize,length)

# summarise runners for patterns/hcaps by sire ID
PatternRunsSireID<-tapply(OutPatterns$SIREID,OutPatterns$SIREID,mean)
PatternRunsSire<-tapply(OutPatterns$SIREID,OutPatterns$SIREID,length)
PatternRunsSireOvs<-tapply(OutPatterns$ovs,OutPatterns$SIREID,sum)
PatternRunsSireTiny<-tapply(OutPatterns$tiny,OutPatterns$SIREID,sum)
PatternRunsSireSml<-tapply(OutPatterns$sml,OutPatterns$SIREID,sum)
PatternRunsSireMed<-tapply(OutPatterns$med,OutPatterns$SIREID,sum)
PatternRunsSireLge<-tapply(OutPatterns$lge,OutPatterns$SIREID,sum)

HcapRunsSireID<-tapply(OutHcaps$SIREID,OutHcaps$SIREID,mean)
HcapRunsSire<-tapply(OutHcaps$SIREID,OutHcaps$SIREID,length)
HcapRunsSireOvs<-tapply(OutHcaps$ovs,OutHcaps$SIREID,sum)
HcapRunsSireTiny<-tapply(OutHcaps$tiny,OutHcaps$SIREID,sum)
HcapRunsSireSml<-tapply(OutHcaps$sml,OutHcaps$SIREID,sum)
HcapRunsSireMed<-tapply(OutHcaps$med,OutHcaps$SIREID,sum)
HcapRunsSireLge<-tapply(OutHcaps$lge,OutHcaps$SIREID,sum)

# calc the average age of the horses run by each trainer category in handicaps
#
HcapAge.ovs<-sum(OutHcaps$ovs*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$ovs),na.rm=TRUE)
HcapAge.tiny<-sum(OutHcaps$tiny*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$tiny),na.rm=TRUE)
HcapAge.sml<-sum(OutHcaps$sml*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$sml),na.rm=TRUE)
HcapAge.med<-sum(OutHcaps$med*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$med),na.rm=TRUE)
HcapAge.lge<-sum(OutHcaps$lge*as.numeric(OutHcaps$horseage),na.rm=TRUE)/sum(as.numeric(OutHcaps$lge),na.rm=TRUE)
#
PtrnAge.ovs<-sum(OutPatterns$ovs*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$ovs),na.rm=TRUE)
PtrnAge.tiny<-sum(OutPatterns$tiny*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$tiny),na.rm=TRUE)
PtrnAge.sml<-sum(OutPatterns$sml*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$sml),na.rm=TRUE)
PtrnAge.med<-sum(OutPatterns$med*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$med),na.rm=TRUE)
PtrnAge.lge<-sum(OutPatterns$lge*as.numeric(OutPatterns$horseage),na.rm=TRUE)/sum(as.numeric(OutPatterns$lge),na.rm=TRUE)

 
# as the winners tapply won’t include all runners, match back into winner variables to line up winners with runners
z<-match(PatternRunsSireID,PatternWinsSireIDTmp)
PatternWinsSire<-PatternWinsSireTmp[z]
PatternWinsSireOvs<-PatternWinsSireOvsTmp[z]
PatternWinsSireTiny<-PatternWinsSireTinyTmp[z]
PatternWinsSireSml<-PatternWinsSireSmlTmp[z]
PatternWinsSireMed<-PatternWinsSireMedTmp[z]
PatternWinsSireLge<-PatternWinsSireLgeTmp[z]

z<-which(is.na(PatternWinsSire))
PatternWinsSire[z]<-0
z<-which(is.na(PatternWinsSireOvs))
PatternWinsSireOvs[z]<-0
z<-which(is.na(PatternWinsSireTiny))
PatternWinsSireTiny[z]<-0
z<-which(is.na(PatternWinsSireSml))
PatternWinsSireSml[z]<-0
z<-which(is.na(PatternWinsSireMed))
PatternWinsSireMed[z]<-0
z<-which(is.na(PatternWinsSireLge))
PatternWinsSireLge[z]<-0

# repeat for handicaps

z<-match(HcapRunsSireID,HcapWinsSireIDTmp)
HcapWinsSire<-HcapWinsSireTmp[z]
HcapWinsSireOvs<-HcapWinsSireOvsTmp[z]
HcapWinsSireTiny<-HcapWinsSireTinyTmp[z]
HcapWinsSireSml<-HcapWinsSireSmlTmp[z]
HcapWinsSireMed<-HcapWinsSireMedTmp[z]
HcapWinsSireLge<-HcapWinsSireLgeTmp[z]

z<-which(is.na(HcapWinsSire))
HcapWinsSire[z]<-0
z<-which(is.na(HcapWinsSireOvs))
HcapWinsSireOvs[z]<-0
z<-which(is.na(HcapWinsSireTiny))
HcapWinsSireTiny[z]<-0
z<-which(is.na(HcapWinsSireSml))
HcapWinsSireSml[z]<-0
z<-which(is.na(HcapWinsSireMed))
HcapWinsSireMed[z]<-0
z<-which(is.na(HcapWinsSireLge))
HcapWinsSireLge[z]<-0

# calc IVs per sire
PatternIVsire<-(PatternWinsSire/sum(PatternWinsSire))/(PatternRunsSire/sum(PatternRunsSire))
HcapIVsire<-(HcapWinsSire/sum(HcapWinsSire))/(HcapRunsSire/sum(HcapRunsSire))

# calc IV adjusted runners by sire
#
PatternRunsSire.IV<- PatternRunsSire*PatternIVsire
PatternRunsSireOvs.IV<- PatternRunsSireOvs*PatternIVsire
PatternRunsSireTiny.IV<- PatternRunsSireTiny*PatternIVsire
PatternRunsSireSml.IV<- PatternRunsSireSml*PatternIVsire
PatternRunsSireMed.IV<- PatternRunsSireMed*PatternIVsire
PatternRunsSireLge.IV<- PatternRunsSireLge*PatternIVsire

HcapRunsSire.IV<- HcapRunsSire*HcapIVsire
HcapRunsSireOvs.IV<- HcapRunsSireOvs*HcapIVsire
HcapRunsSireTiny.IV<- HcapRunsSireTiny*HcapIVsire
HcapRunsSireSml.IV<- HcapRunsSireSml*HcapIVsire
HcapRunsSireMed.IV<- HcapRunsSireMed*HcapIVsire
HcapRunsSireLge.IV<- HcapRunsSireLge*HcapIVsire