## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------ BiocStyle::latex() ## ----library------------------------------------------------------------- library(HIBAG) ## ----plot, fig.width=8, fig.height=4, fig.align='center'----------------- # load the published parameter estimates from European ancestry # e.g., filename <- "HumanOmniExpressExome-European-HLA4-hg19.RData" # here, we use example data in the package filename <- system.file("extdata", "ModelList.RData", package="HIBAG") model.list <- get(load(filename)) # HLA imputation at HLA-A hla.id <- "A" model <- hlaModelFromObj(model.list[[hla.id]]) summary(model) # SNPs in the model head(model$snp.id) head(model$snp.position) # plot SNP information plot(model) ## ----eval=FALSE---------------------------------------------------------- ## ######################################################################### ## # Import your PLINK BED file ## # ## yourgeno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") ## summary(yourgeno) ## ## # best-guess genotypes and all posterior probabilities ## pred.guess <- predict(model, yourgeno, type="response+prob") ## summary(pred.guess) ## ## pred.guess$value ## pred.guess$postprob ## ----build-model, eval=FALSE--------------------------------------------- ## library(HIBAG) ## ## # load HLA types and SNP genotypes in the package ## data(HLA_Type_Table, package="HIBAG") ## data(HapMap_CEU_Geno, package="HIBAG") ## ## # a list of HLA genotypes ## # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...), ## # H2=c("02:01", "03:01", ...), locus="A") ## # the HLA type of the first individual is 01:02/02:01, ## # the second is 05:01/03:01, ... ## hla.id <- "A" ## train.HLA <- hlaAllele(HLA_Type_Table$sample.id, ## H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], ## H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], ## locus=hla.id, assembly="hg19") ## ## # selected SNPs: ## # the flanking region of 500kb on each side, ## # or an appropriate flanking size without sacrificing predictive accuracy ## region <- 500 # kb ## snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, ## HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19") ## length(snpid) ## ## # training genotypes ## # import your PLINK BED file, ## # e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") ## # or, ## train.geno <- hlaGenoSubset(HapMap_CEU_Geno, ## snp.sel = match(snpid, HapMap_CEU_Geno$snp.id)) ## summary(train.geno) ## ## # train a HIBAG model ## set.seed(100) ## model <- hlaAttrBagging(train.HLA, train.geno, nclassifier=100) ## ----echo=FALSE---------------------------------------------------------- mobj <- get(load(system.file("extdata", "ModelList.RData", package="HIBAG"))) model <- hlaModelFromObj(mobj$A) ## ------------------------------------------------------------------------ summary(model) ## ----build-model-parallel, eval=FALSE------------------------------------ ## library(HIBAG) ## library(parallel) ## ## # load HLA types and SNP genotypes in the package ## data(HLA_Type_Table, package="HIBAG") ## data(HapMap_CEU_Geno, package="HIBAG") ## ## # a list of HLA genotypes ## # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...), ## # H2=c("02:01", "03:01", ...), locus="A") ## # the HLA type of the first individual is 01:02/02:01, ## # the second is 05:01/03:01, ... ## hla.id <- "A" ## train.HLA <- hlaAllele(HLA_Type_Table$sample.id, ## H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], ## H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], ## locus=hla.id, assembly="hg19") ## ## # selected SNPs: ## # the flanking region of 500kb on each side, ## # or an appropriate flanking size without sacrificing predictive accuracy ## region <- 500 # kb ## snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, ## HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19") ## length(snpid) ## ## # training genotypes ## # import your PLINK BED file, ## # e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") ## # or, ## train.geno <- hlaGenoSubset(HapMap_CEU_Geno, ## snp.sel = match(snpid, HapMap_CEU_Geno$snp.id)) ## summary(train.geno) ## ## ## # Create an environment with an appropriate cluster size, ## # e.g., 2 -- # of (CPU) cores ## cl <- makeCluster(2) ## ## # Building ... ## set.seed(1000) ## hlaParallelAttrBagging(cl, train.HLA, train.geno, nclassifier=100, ## auto.save="AutoSaveModel.RData", stop.cluster=TRUE) ## model.obj <- get(load("AutoSaveModel.RData")) ## model <- hlaModelFromObj(model.obj) ## summary(model) ## ----echo=FALSE---------------------------------------------------------- unlink("AutoSaveModel.RData", force=TRUE) ## ----evaluate------------------------------------------------------------ library(HIBAG) # load HLA types and SNP genotypes in the package data(HLA_Type_Table, package="HIBAG") data(HapMap_CEU_Geno, package="HIBAG") # make a list of HLA types hla.id <- "A" hla <- hlaAllele(HLA_Type_Table$sample.id, H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], locus=hla.id, assembly="hg19") # divide HLA types randomly set.seed(100) hlatab <- hlaSplitAllele(hla, train.prop=0.5) names(hlatab) summary(hlatab$training) summary(hlatab$validation) # SNP predictors within the flanking region on each side region <- 500 # kb snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19") length(snpid) # training and validation genotypes train.geno <- hlaGenoSubset(HapMap_CEU_Geno, snp.sel = match(snpid, HapMap_CEU_Geno$snp.id), samp.sel = match(hlatab$training$value$sample.id, HapMap_CEU_Geno$sample.id)) summary(train.geno) test.geno <- hlaGenoSubset(HapMap_CEU_Geno, samp.sel=match( hlatab$validation$value$sample.id, HapMap_CEU_Geno$sample.id)) ## ----eval=FALSE---------------------------------------------------------- ## # train a HIBAG model ## set.seed(100) ## model <- hlaAttrBagging(hlatab$training, train.geno, nclassifier=100) ## ----echo=FALSE---------------------------------------------------------- mobj <- get(load(system.file("extdata", "OutOfBag.RData", package="HIBAG"))) model <- hlaModelFromObj(mobj) ## ------------------------------------------------------------------------ summary(model) # validation pred <- predict(model, test.geno) # compare comp <- hlaCompareAllele(hlatab$validation, pred, allele.limit=model, call.threshold=0) comp$overall ## ----out-text------------------------------------------------------------ # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="txt") ## ----out-tex, results="asis"--------------------------------------------- # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="tex", header=FALSE) ## ----release-model, eval=FALSE------------------------------------------- ## library(HIBAG) ## ## # make a list of HLA types ## hla.id <- "DQA1" ## hla <- hlaAllele(HLA_Type_Table$sample.id, ## H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], ## H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], ## locus=hla.id, assembly="hg19") ## ## # training genotypes ## region <- 500 # kb ## snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position, ## hla.id, region*1000, assembly="hg19") ## train.geno <- hlaGenoSubset(HapMap_CEU_Geno, ## snp.sel = match(snpid, HapMap_CEU_Geno$snp.id), ## samp.sel = match(hla$value$sample.id, HapMap_CEU_Geno$sample.id)) ## ## set.seed(1000) ## model <- hlaAttrBagging(hla, train.geno, nclassifier=100) ## summary(model) ## ## # remove unused SNPs and sample IDs from the model ## mobj <- hlaPublish(model, ## platform = "Illumina 1M Duo", ## information = "Training set -- HapMap Phase II", ## warning = NULL, ## rm.unused.snp=TRUE, anonymize=TRUE) ## ## save(mobj, file="Your_HIBAG_Model.RData") ## ----echo=FALSE---------------------------------------------------------- unlink("Your_HIBAG_Model.RData", force=TRUE) ## ----eval=FALSE---------------------------------------------------------- ## # assume the HIBAG models are stored in R objects: mobj.A, mobj.B, ... ## ## ModelList <- list() ## ModelList[["A"]] <- mobj.A ## ModelList[["B"]] <- mobj.B ## ... ## ## # save to an R data file ## save(ModelList, file="HIBAG_Model_List.RData") ## ----sessioninfo, results="asis"----------------------------------------- toLatex(sessionInfo())