#***********************************************************************************************************
#***********************************************************************************************************
#********* blend1.R
#*********
#********* Copyright (C) 2014 Diamond Light Source & Imperial College London
#*********
#********* Authors: James Foadi & Gwyndaf Evans
#*********
#********* This code is distributed under the BSD license, a copy of which is
#********* included in the root directory of this package.
#***********************************************************************************************************
#***********************************************************************************************************
# First R module of program BLEND (later to be replaced by C++ code)
# It relies on files produced by blend.cpp.
#
#
# Load libraries and auxiliary files
require(MASS, quietly = TRUE, warn.conflicts = FALSE)   ## For rlm (robust regression)

# Functions
#
# Clean datasets: only accept intensities whose partials add up to
# a fraction between acmin and acmax
cleanData <- function(data,acmin=0.95,acmax=1.05) {
  data[(data$V5 == "N" & data$V6 < acmin) | (data$V5 == "N" & data$V6 > acmax),5] <- NA
  data <- na.omit(data)

  return(data)
}

#
# Determine if radiation damage detection and cure are to be activated for this data set.
# The batch numbers must span at least minIma images and the best resolution must be
# better (smaller d) than minReso angstroms. Returns TRUE for ok, and FALSE if not ok.
okRadDam <- function(data,minIma=6,minReso=5) {
  ansR <- TRUE

  # Number of images (as the range spanned by the batch numbers)
  #images <- data$V4
  tmp <- range(data$V4,na.rm=TRUE)
  rima <- tmp[2]-tmp[1]
  #if (length(images) < minIma) ansR <- FALSE
  if (rima < minIma) ansR <- FALSE

  # Resolution range (V9 holds 1/d, so 1/max(V9) is the best resolution in angstroms)
  resos <- data$V9
  if (1/max(resos) > minReso) ansR <- FALSE

  return(ansR)
}

#
# Bin a quantity II (typically intensity) against resolution ss and return, for
# each of the nbin bins, counts, bin centres, mean and standard deviation of II.
aveIvsRes <- function(nbin,ss,II=NULL,m=min(ss),M=max(ss)) {
  # Resize vectors if needed
  idx <- which(ss >= m & ss <= M)

  # Return NULL if no elements are included inside min and max range
  if (length(idx) == 0) {
    return(list(NULL,NULL,NULL,NULL))
  }

  # If there are elements carry on
  ss <- ss[idx]
  if (length(II) > 0) II <- II[idx]

  # Break points
  breaks <- seq(m,M,length=(nbin+1))

  # Histogram
  hst <- hist(ss,breaks=breaks,plot=FALSE)

  # Partition intensity in resolution bins
  if (length(II) > 0) {
    aveI <- c()
    sdI <- c()
    for (i in 1:nbin) {
      idx <- which(ss >= breaks[i] & ss < breaks[i+1])
      if (i == nbin) idx <- which(ss >= breaks[i] & ss <= breaks[i+1])
      if (length(idx) > 0) {
        aveI <- c(aveI,mean(II[idx],na.rm=TRUE))
        sdI <- c(sdI,sd(II[idx],na.rm=TRUE))
      }
      if (length(idx) == 0) {
        aveI <- c(aveI,NA)
        sdI <- c(sdI,NA)
      }
    }
  }
  if (length(II) == 0) {
    aveI <- NULL
    sdI <- NULL
  }

  return(list(hst$counts,hst$mids,aveI,sdI))
}
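# A minimal usage sketch for aveIvsRes (hypothetical, randomly generated data;
# in BLEND "ss" comes from column V9 of a reflection file):
#   set.seed(0)
#   ss <- runif(1000,0.1,0.5)       # reciprocal resolutions (1/d)
#   II <- rexp(1000,rate=5)         # fake intensities
#   tmp <- aveIvsRes(20,ss,II)
#   # tmp[[1]] = counts, tmp[[2]] = bin centres, tmp[[3]] = <I>, tmp[[4]] = sd(I)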
#
Wplots <- function(data,nbin,nwin=1,m=NA,M=NA) {
  # Analysis of Wilson plot shape evolution in time (batch dependency)
  # Batch column is number 4
  # Full/Partial flag is column 5
  # Intensity column is number 7
  # sigI column is number 8
  # Resolution column is number 9

  # Extract batches
  batches <- data[,4]

  # Range of batches across which to investigate plot evolution
  rbat <- sort(unique(batches))
  if (length(rbat) < (2*nwin+1)) stop("Number of batches too small for meaningful modeling")

  # Just to extract range (yrange will be worked out in the loop)
  ltmp <- aveIvsRes(nbin,data[,9])
  xrange <- range(ltmp[[2]])
  if (!is.na(m)) xrange[1] <- m
  if (!is.na(M)) xrange[2] <- M

  # Cycle through increasing number of batches (from early to late time during data collection)
  for (i in (nwin+1):(length(rbat)-nwin)) {
    # Select resolutions and intensities (standardized or not) within batch range
    ileft <- rbat[(i-nwin)]
    iright <- rbat[(i+nwin)]
    s <- data[data[,4] >= ileft & data[,4] <= iright,9]
    nI <- data[data[,4] >= ileft & data[,4] <= iright,7]

    # Create elements of Wilson plots
    ltmp <- aveIvsRes(nbin,s,nI,m=xrange[1],M=xrange[2])
    if (i == (nwin+1)) {
      ss <- matrix(ltmp[[2]],nrow=1,byrow=TRUE)
      II <- matrix(ltmp[[3]],nrow=1,byrow=TRUE)
      sDD <- matrix(ltmp[[4]],nrow=1,byrow=TRUE)
    }
    if (i != (nwin+1)) {
      ss <- rbind(ss,ltmp[[2]])
      II <- rbind(II,ltmp[[3]])
      sDD <- rbind(sDD,ltmp[[4]])
    }
  }

  return(list(ss=ss,II=II,sDD=sDD))
}

#
# Fit an exponential decay model to each resolution bin of the dynamic Wilson
# plots and return, for every usable bin, the fitted I0, B and R-squared.
mrad <- function(wplots,addbit=0.1) {
  # Keep only resolution bins that contain data in at least one plot
  ikey <- checkEmptyPlots(wplots)

  # Resolutions to be added to final list
  ss <- wplots[[1]][1,ikey]

  # Models
  # Exponential
  I0 <- c()
  B <- c()
  RR <- c()
  for (i in ikey) {
    y0 <- wplots[[2]][,i]
    minY <- min(y0,na.rm=TRUE)
    if (minY > 0) y <- y0
    if (minY <= 0) y <- y0-minY+addbit
    #y[y == 0] <- NA   # To avoid NaN and -Inf in logarithm of zero
    x <- 1:length(y)
    model <- lm(log(y)~x,na.action=na.omit)
    cff <- unname(coef(model))
    if (minY > 0) I0 <- c(I0,exp(cff[1]))
    if (minY <= 0) I0 <- c(I0,exp(cff[1])+minY-addbit)   # "<=" (was "<") so the minY == 0 case is not skipped
    B <- c(B,-cff[2])
    RR <- c(RR,summary(model)[[8]][1])   # r.squared
  }
  lista_exponential <- list(s=ss,I0=I0,B=B,RR=RR)

  return(lista_exponential)
}

#
# As some bins of each Wilson plot can contain exclusively NA's,
# this function extracts the bin number of all bins with some data
checkEmptyPlots <- function(wplots) {
  nbin <- dim(wplots$II)[2]
  ikey <- c()
  for (i in 1:nbin) {
    nna <- sum(is.na(wplots[[2]][,i]))
    if (nna < dim(wplots$II)[1]) ikey <- c(ikey,i)
    #if (nna == 0) ikey <- c(ikey,i)
  }

  return(ikey)
}

#
# New procedure to detect radiation damage and decide whether images should be
# cut off. Returns the (intercept, slope) of a robust linear fit of B against
# resolution when a significantly positive slope is found, or c(-Inf,Inf) otherwise.
newraddam <- function(wplots) {
  # Radiation damage routines
  tmp <- mrad(wplots)
  model <- rlm(tmp$B ~ tmp$s,maxit=40)
  coefs <- summary(model)$coefficients[2,1:2]   # slope estimate and its standard error
  attributes(coefs) <- NULL
  ans <- FALSE   # default to FALSE (was NULL, which left "ans" unset on some paths)
  if (!is.na(coefs[1])) {
    if (coefs[1] < 0) ans <- FALSE
  }
  if (is.na(coefs[1])) ans <- FALSE
  if (!is.na(coefs[2])) {
    if (!is.na(coefs[1])) {
      if (coefs[2] > 0) {
        if (coefs[1]-coefs[2] <= 0) ans <- FALSE
        if (coefs[1]-coefs[2] > 0) ans <- TRUE
      }
    }
  }
  if (is.na(coefs[2])) ans <- FALSE

  # If ans is FALSE return infinity
  if (ans) {
    coefs <- model$coefficients
    attributes(coefs) <- NULL
  }
  if (!ans) {
    coefs <- c(-Inf,Inf)
  }

  return(coefs)
}

#
# Function returning the cutting curve, i.e. for each resolution s, the image
# offset from which data should be cut off
tfit <- function(s,coefs,fdrop=0.75) {
  Be <- coefs[2]*s+coefs[1]
  if (Be > 0) te <- -log(fdrop)/Be
  if (Be <= 0) te <- NA

  return(te)
}
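# Worked example for tfit (hypothetical coefficients, of the kind returned by
# newraddam): with coefs = c(0.5,2), i.e. intercept 0.5 and slope 2, at
# s = 0.2 one has Be = 2*0.2+0.5 = 0.9, so for the default fdrop = 0.75 the
# cutoff is te = -log(0.75)/0.9, about 0.32 (an offset later added to the
# first batch number in the main loop):
#   tfit(0.2,c(0.5,2))   # ~0.3196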
# To extract a list with datasets to be merged, as many as the nodes of a dendrogram (htree).
# This list can be subsequently used in "merge_datasets" (in a loop) to extract statistics
# for each node of the dendrogram.
treeToList <- function(htree,lres,hres) {
  # Function to list all groups (and resolution ranges) corresponding to each node in a dendrogram.
  # "htree" is the cluster object (from hclust).
  # lres and hres are named vectors. Names must coincide with labels in the cluster object.

  # Check dataset numbers in tree agree with lres and hres length
  nres <- length(lres)
  if (length(hres) != nres) stop("Bad input to function treeToList: lres and hres have differing lengths")
  tmp <- as.vector(htree$merge)
  tmp <- -tmp[tmp < 0]
  if (length(tmp) != nres) stop("Dendrogram deals with a number of datasets differing from length of lres or hres")
  groups <- list()
  groups_resos <- list()
  itree <- dim(htree$merge)[1]
  for (i in 1:itree) {
    il <- htree$merge[i,1]
    ir <- htree$merge[i,2]
    if (il < 0) lab1 <- htree$labels[-il]
    if (ir < 0) lab2 <- htree$labels[-ir]
    if (il > 0) lab1 <- groups[[il]]
    if (ir > 0) lab2 <- groups[[ir]]
    lab <- c(lab1,lab2)
    idx <- which(names(lres) %in% lab)
    Lres <- max(lres[idx])
    idx <- which(names(hres) %in% lab)
    #Hres <- min(hres[idx])
    Hres <- max(hres[idx])
    lab <- as.integer(lab)
    groups <- c(groups,list(lab))
    groups_resos <- c(groups_resos,list(c(Lres,Hres)))
  }

  return(list(groups,groups_resos))
}

#
# Interpolate the plot of resolution vs signal-to-noise ratio with a degree-10
# polynomial and get the highest resolution which provides the given signal-to-noise
bestReso <- function(xr,yr,snr=1.5) {
  # Flag to warn user if intensities are weak
  weakFLAG <- FALSE

  # Modeling signal-to-noise
  model <- lm(yr~poly(xr,degree=10))
  xp <- seq(range(xr)[1],range(xr)[2],length=10000)
  yp <- predict(model,list(xr=xp))

  # If data are too weak, raise flag and return full reso
  dus <- yp[yp < snr]
  if (length(dus)/10000 > 0.5) {
    weakFLAG <- TRUE
    return(c(weakFLAG,1/xp[10000]))
  }

  # Move from highest to lowest resolution until snr is reached
  # (the "ip > 1" guard avoids an out-of-range index if all sampled points fall below snr)
  ip <- 10000
  while (ip > 1 && yp[ip] < snr) ip <- ip-1

  return(c(weakFLAG,1/xp[ip]))
}

#
# Reduce data dimensionality and standardize for cell parameters
nparCell <- function(data,cn,cumprob) {
  # Normalize data
  model <- prcomp(data,scale=TRUE)
  smod <- summary(model)

  # Choose minimal number of variables that give enough statistical variability
  if (length(model$x[1,]) == 1) npar <- model$x
  if (length(model$x[1,]) > 1) {
    idx <- which(smod$importance[3,] > cumprob)
    npar <- model$x[,1:idx[1]]
  }
  npar <- as.matrix(npar)
  rownames(npar) <- cn

  return(npar)
}

#
# f is a vector containing m sampled values of a reference function f.
# g is a vector containing m sampled values of a function g to be scaled close to f.
# This function returns the scaling constant k: new g = k * g
scale_two_functions <- function(f,g) {
  m <- length(f)
  if (length(g) != m) stop("The two sampled functions need to have the same number of sampled values")

  # k is found as the value making sum((k*g[i]-f[i])^2) a minimum
  fsx <- which(is.na(f))
  gsx <- which(is.na(g))
  f[gsx] <- NA
  g[fsx] <- NA
  f <- na.omit(f)
  g <- na.omit(g)
  k <- sum(f*g)/sum(g^2)

  return(k)
}
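# Worked example for scale_two_functions: with f = c(2,4,6) and g = c(1,2,3)
# the least-squares scale is k = sum(f*g)/sum(g^2) = 28/14 = 2, and k*g then
# reproduces f exactly:
#   scale_two_functions(c(2,4,6),c(1,2,3))   # 2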
#
# Prototype to produce an "npar" type matrix on which cluster analysis can be applied
nparWilson <- function(listBplots,cn,nref=1,cumprob=0.95) {
  # Raise all plots to make them positive
  listBplots <- raiseWplots(listBplots)

  # Reference structure
  wref <- listBplots[[nref]]

  # Scale plots to line them up with reference structure
  for (i in 1:length(listBplots)) {
    tmp <- wScale(wref[[3]],listBplots[[i]][[3]])
    listBplots[[i]][[3]] <- tmp
  }

  # Build data frame
  wpar <- matrix(listBplots[[1]][[3]],nrow=1,byrow=TRUE)
  for (i in 2:length(listBplots)) wpar <- rbind(wpar,matrix(listBplots[[i]][[3]],nrow=1,byrow=TRUE))

  # Replace NA's if they (alas!) appear in wpar
  wpar <- interpolateNA(listBplots[[1]][[2]],wpar)

  # Principal Component Analysis to reduce dimensionality
  model <- prcomp(wpar,scale=TRUE)
  smod <- summary(model)
  if (length(model$x[1,]) == 1) npar <- model$x
  if (length(model$x[1,]) > 1) {
    idx <- which(smod$importance[3,] > cumprob)
    npar <- model$x[,1:idx[1]]
  }
  npar <- as.matrix(npar)
  rownames(npar) <- cn

  return(npar)
}

#
# To raise plots in case parts of them are negative (used when argument of a logarithm)
raiseWplots <- function(listWplots,addbit=100) {
  # Find plots minimum
  mWP <- min(listWplots[[1]][[3]],na.rm=TRUE)
  for (i in 2:length(listWplots)) if (min(listWplots[[i]][[3]],na.rm=TRUE) < mWP) mWP <- min(listWplots[[i]][[3]],na.rm=TRUE)

  # Now raise all plots (if mWP is negative)
  if (!is.na(mWP)) {
    if (mWP < 0) for (i in 1:length(listWplots)) listWplots[[i]][[3]] <- listWplots[[i]][[3]]-mWP+addbit
  }

  return(listWplots)
}

#
# Scale a function g (g is an array) in relation to f (array with same number of points as g)
# using exponential regression. Both g and f are supposed to be positive, i.e. none of the sampled
# points can be zero or negative.
wScale <- function(f,g) {
  # x and y for regression
  x <- 1:length(f)
  y <- log(g/f)

  # Modeling
  model <- lm(y~x)

  # Coefficients
  coefs <- model$coefficients
  attributes(coefs) <- NULL

  # Scale g
  g <- g*exp(-coefs[2]*x)/exp(coefs[1])

  return(g)
}

#
# To re-arrange file names as changed by file forR_raddam.dat so that
# they have the same order as file NEW_list_of_files.dat
reArrangeFilenames <- function(filenames) {
  idx <- c()
  for (nome in filenames) {
    tmp <- strsplit(nome,"_")
    tmp <- strsplit(tmp[[1]][3],".",fixed=TRUE)
    idx <- c(idx,as.integer(tmp[[1]][1]))
  }

  # Build data frame, then order by the extracted index
  idxfile <- data.frame(idx=idx,file=I(filenames))
  newidxfile <- idxfile[order(idxfile$idx),]

  return(newidxfile$file)
}
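# Example for reArrangeFilenames (hypothetical file names; the function assumes
# the pattern <prefix>_<tag>_<number>.dat and sorts numerically on <number>):
#   reArrangeFilenames(c("refs_ave_10.dat","refs_ave_2.dat"))
#   # -> "refs_ave_2.dat" "refs_ave_10.dat"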
# Functions to calculate cell variation in angstroms
.dcl <- function(a,b,c,aa,bb,gg) {
  # Input: cell parameters. Output: values useful to all crystallographic
  # calculations. These are:
  # 1) sa = sin(alpha), sb = sin(beta), sc = sin(gamma)
  # 2) ca = cos(alpha), cb = cos(beta), cc = cos(gamma)
  # 3) sides of reciprocal cell: ar = a*, br = b*, cr = c*
  # 4) sines of angles of reciprocal cell: sar = sin(alpha*), sbr = sin(beta*), scr = sin(gamma*)
  # 5) cosines of angles of reciprocal cell: car = cos(alpha*), cbr = cos(beta*), ccr = cos(gamma*)
  # 6) Volume of unit cell: V
  aa <- aa*pi/180
  bb <- bb*pi/180
  gg <- gg*pi/180
  sa <- sin(aa)
  sb <- sin(bb)
  sc <- sin(gg)
  ca <- cos(aa)
  cb <- cos(bb)
  cc <- cos(gg)

  # To avoid NaN generated by rounding-off errors, use for cell-derived quantities
  # formulas derived previously by computational crystallographers
  sang <- 0.5*(aa+bb+gg)
  V2 <- sqrt(sin(sang-aa)*sin(sang-bb)*sin(sang-gg)*sin(sang))
  V <- 2*a*b*c*V2
  ar <- b*c*sa/V
  br <- a*c*sb/V
  cr <- a*b*sc/V
  car <- (cb*cc-ca)/(sb*sc)
  cbr <- (ca*cc-cb)/(sa*sc)
  ccr <- (ca*cb-cc)/(sa*sb)
  sar <- sqrt(1-car*car)
  sbr <- sqrt(1-cbr*cbr)
  scr <- sqrt(1-ccr*ccr)
  l <- c(sa,sb,sc,ca,cb,cc,ar,br,cr,sar,sbr,scr,car,cbr,ccr,V)
  names(l) <- c("SIN_ALPHA","SIN_BETA","SIN_GAMMA","COS_ALPHA","COS_BETA","COS_GAMMA",
                "A*","B*","C*","SIN_ALPHA*","SIN_BETA*","SIN_GAMMA*",
                "COS_ALPHA*","COS_BETA*","COS_GAMMA*","V")

  return(l)
}

# Given cell parameters, return the inverse of the cell orthogonalization matrix
# (first choice in Giacovazzo's book)
.triclinic_to_orthogonal_01 <- function(a,b,c,aa,bb,cc) {
  lp <- .dcl(a,b,c,aa,bb,cc)
  m_1 <- matrix(c(a,0,0,b*lp[6],b*lp[3],0,c*lp[5],-c*lp[2]*lp[13],1/lp[9]),nrow=3,ncol=3,byrow=TRUE)

  return(m_1)
}

# Given cell parameters, return the inverse of the cell orthogonalization matrix
# (MOSFLM choice, second choice in Giacovazzo's book)
.triclinic_to_orthogonal_02 <- function(a,b,c,aa,bb,cc) {
  lp <- .dcl(a,b,c,aa,bb,cc)
  m_1 <- matrix(c(1/lp[7],-lp[15]/(lp[7]*lp[12]),a*lp[5],0,1/(lp[8]*lp[12]),b*lp[4],0,0,c),nrow=3,ncol=3,byrow=TRUE)

  return(m_1)
}

# Compute matrix of cross-cell max distances
evaluateMaxChange <- function(maindf) {
  n <- length(maindf[,2])
  dMat <- matrix(nrow=n,ncol=n)
  for (i in 1:n) {
    for (j in 1:n) {
      M1 <- .triclinic_to_orthogonal_01(maindf[i,2],maindf[i,3],maindf[i,4],maindf[i,5],maindf[i,6],maindf[i,7])
      M2 <- .triclinic_to_orthogonal_01(maindf[j,2],maindf[j,3],maindf[j,4],maindf[j,5],maindf[j,6],maindf[j,7])
      v1 <- M1%*%matrix(c(1,1,1),ncol=1)
      v2 <- M2%*%matrix(c(1,1,1),ncol=1)
      dd <- sqrt((v1[1,1]-v2[1,1])^2+(v1[2,1]-v2[2,1])^2+(v1[3,1]-v2[3,1])^2)
      dMat[i,j] <- dd
    }
  }

  return(dMat)
}

# Functions to evaluate max change in linear dimensions for cell parameters
# Diagonals along the 3 independent faces of the unit cell (law of cosines)
faceDiagonals <- function(a,b,c,aa,bb,cc) {
  dab <- sqrt(a^2+b^2-2*a*b*cos(pi-cc*pi/180))
  dac <- sqrt(a^2+c^2-2*a*c*cos(pi-bb*pi/180))
  dbc <- sqrt(b^2+c^2-2*b*c*cos(pi-aa*pi/180))

  return(c(dab,dac,dbc))
}

# Matrix of pairwise relative differences (in percent)
distRatio <- function(v) {
  # Length of vector
  n <- length(v)

  # Create n X n matrix
  m <- matrix(ncol=n,nrow=n)

  # Double loop to fill the matrix
  for (i in 1:n) {
    for (j in 1:n) {
      dd <- abs(v[i]-v[j])
      mv <- min(v[i],v[j])
      m[i,j] <- (dd/mv)*100
    }
  }

  return(m)
}

# Matrix of pairwise absolute differences
adistRatio <- function(v) {
  # Length of vector
  n <- length(v)

  # Create n X n matrix
  m <- matrix(ncol=n,nrow=n)

  # Double loop to fill the matrix
  for (i in 1:n) {
    for (j in 1:n) {
      m[i,j] <- abs(v[i]-v[j])
    }
  }

  return(m)
}

# Get matrix indices
get_indices <- function(N,n) {
  # N is the serial number (column-major) and n the size of the n X n matrix
  i <- N%%n
  if (i == 0) i <- n
  j <- N%/%n+1
  if (i == n) j <- j-1
  if (j > n) j <- n

  return(c(i,j))
}
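# Example for get_indices: in an n X n matrix stored column-wise (which is how
# which() numbers its matches), serial element 7 of a 4 X 4 matrix sits at
# row 3 of column 2:
#   get_indices(7,4)   # c(3,2)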
# Maximum linear cell variation over the datasets selected by idx: returns the
# LCV (percent), the absolute LCV (angstroms) and the serial numbers of the
# two datasets realising the maximum.
maxRatio <- function(macropar,idx) {
  # Cell parameters
  cpar <- macropar[idx,2:7]

  # Number of datasets
  n <- length(cpar[,1])

  # 3 diagonal lengths for all datasets
  dab <- c()
  dac <- c()
  dbc <- c()
  for (i in 1:n) {
    tmp <- faceDiagonals(cpar[i,1],cpar[i,2],cpar[i,3],cpar[i,4],cpar[i,5],cpar[i,6])
    dab <- c(dab,tmp[1])
    dac <- c(dac,tmp[2])
    dbc <- c(dbc,tmp[3])
  }

  # Calculate maxRatio matrix for the 3 diagonals vectors and extract max value for each matrix
  #mab <- max(distRatio(dab))
  mab <- max(adistRatio(dab))
  #iab <- which(distRatio(dab) == max(distRatio(dab)))
  iab <- which(adistRatio(dab) == max(adistRatio(dab)))
  ijab <- get_indices(iab[1],n)
  #sab <- adistRatio(dab)[iab]
  sab <- distRatio(dab)[iab]
  #mac <- max(distRatio(dac))
  mac <- max(adistRatio(dac))
  #iac <- which(distRatio(dac) == max(distRatio(dac)))
  iac <- which(adistRatio(dac) == max(adistRatio(dac)))
  ijac <- get_indices(iac[1],n)
  #sac <- adistRatio(dac)[iac]
  sac <- distRatio(dac)[iac]
  #mbc <- max(distRatio(dbc))
  mbc <- max(adistRatio(dbc))
  #ibc <- which(distRatio(dbc) == max(distRatio(dbc)))
  ibc <- which(adistRatio(dbc) == max(adistRatio(dbc)))
  ijbc <- get_indices(ibc[1],n)
  #sbc <- adistRatio(dbc)[ibc]
  sbc <- distRatio(dbc)[ibc]
  vv <- c(sab[1],sac[1],sbc[1])
  ij <- matrix(c(ijab,ijac,ijbc),ncol=3)
  imall <- which(c(mab,mac,mbc) == max(c(mab,mac,mbc)))[1]
  Mpar <- vv[imall]
  ijs <- ij[,imall]
  cns <- macropar[idx[ijs],"cn"]
  #return(c(max(mab,mac,mbc),Mpar,cns[1],cns[2]))

  return(c(Mpar,max(mab,mac,mbc),cns[1],cns[2]))
}

# Coordinates at which each merging node is drawn on the dendrogram plot
find_nodes_coords <- function(clst, clns, cn) {
  # Number of objects
  nobj <- length(clst$merge[,1]) + 1

  # x coordinates for individual objects
  xobj <- clns[[nobj - 1]]

  # Go through nodes formation
  x <- numeric(nobj - 1)
  y <- numeric(nobj - 1)
  for (i in 1:length(clst$merge[,1])) {
    ele1 <- clst$merge[i,1]
    ele2 <- clst$merge[i,2]
    if (ele1 < 0 & ele2 < 0) {
      #idx1 <- match(abs(ele1), xobj)
      #idx2 <- match(abs(ele2), xobj)
      idx1 <- match(cn[abs(ele1)], xobj)
      idx2 <- match(cn[abs(ele2)], xobj)
      x[i] <- 0.5 * (idx1 + idx2)
      y[i] <- clst$height[i]
    }
    if (ele1 < 0 & ele2 > 0) {
      #idx1 <- match(abs(ele1), xobj)
      idx1 <- match(cn[abs(ele1)], xobj)
      idx2 <- x[ele2]
      x[i] <- 0.5 * (idx1 + idx2)
      y[i] <- clst$height[i]
    }
    if (ele1 > 0 & ele2 < 0) {
      idx1 <- x[ele1]
      #idx2 <- match(abs(ele2), xobj)
      idx2 <- match(cn[abs(ele2)], xobj)
      x[i] <- 0.5 * (idx1 + idx2)
      y[i] <- clst$height[i]
    }
    if (ele1 > 0 & ele2 > 0) {
      idx1 <- x[ele1]
      idx2 <- x[ele2]
      x[i] <- 0.5 * (idx1 + idx2)
      y[i] <- clst$height[i]
    }
  }

  return(list(x=x,y=y))
}
# The following function, at present, uses a very crude and badly-thought-out
# interpolation. It needs more investigation and proper fixing. It is used
# wherever prcomp is called.
interpolateNA <- function(x,M) {
  # Extract rows and columns
  tmp <- dim(M)
  m <- tmp[1]
  n <- tmp[2]

  # Find rows where there are NA's
  na_rows <- c()
  for (i in 1:m) {
    jfl <- 0
    for (j in 1:n) {
      if (is.na(M[i,j])) jfl <- 1
    }
    if (jfl == 1) na_rows <- c(na_rows,i)
  }

  # Return unchanged M if no NA's found
  if (length(na_rows) == 0) return(M)

  # Mend each "NA's-containing" row with an interpolated value
  for (i in na_rows) {
    # Find out positions of NA's and their complement
    idx <- which(is.na(M[i,]))
    jdx <- which(!is.na(M[i,]))

    # Polynomial interpolation (normally degree 10; lower if fewer points are available)
    xr <- x[jdx]
    yr <- M[i,jdx]
    dgree <- min(length(yr),11)-1
    #model <- lm(yr~poly(xr,degree=10))
    model <- lm(yr~poly(xr,degree=dgree))
    xp <- x
    yp <- predict(model,list(xr=xp))

    # If interpolated NA values are negative, replace with 0.1
    for (j in idx) if (yp[j] < 0) yp[j] <- 0.1

    # Replace NA values in original array
    M[i,idx] <- yp[idx]
  }

  return(M)
}
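# A small sketch for interpolateNA (hypothetical 2 x 12 matrix with one missing
# bin; the NA in row 2 is replaced by the polynomial prediction at x[3]):
#   x <- 1:12
#   M <- rbind(sin(x),sin(x))
#   M[2,3] <- NA
#   interpolateNA(x,M)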
####################################################################
####################################################################
######################### Main program #############################
####################################################################
####################################################################

# To avoid warning messages set warn to a negative value
options(warn = -1)

# Fix global parameters here (default values)
nbin <- 20          # Number of bins for dynamic and overall Wilson plots
fdrop <- 0.75       # Intensity decay fraction for datasets affected by radiation damage (keyword RADFRAC)
isigi <- 1.5        # Signal-to-noise ratio wanted to estimate highest resolution cut
cparweight <- 1.0   # Weight to be given to cluster analysis with cell parameters as descriptors.
                    # Wilson plots descriptors automatically get weight 1-cparweight

#######################################################################################################
#######################################################################################################
######################################## KEYWORDS FROM FILE ###########################################
#######################################################################################################
#######################################################################################################
# Check whether there's a ready keyword file and use it if found
if (file.exists("BLEND_KEYWORDS.dat")) {
  contents <- scan("BLEND_KEYWORDS.dat",what="character",sep="\n",quiet=TRUE)

  # Find section "BLEND KEYWORDS"
  idxs <- grep("BLEND KEYWORDS",contents,fixed=TRUE)
  idxe <- grep("POINTLESS KEYWORDS",contents,fixed=TRUE)

  # Only extract values if something is included in the "BLEND KEYWORDS" section
  if ((idxe-idxs) > 1) {
    idxs <- idxs+1
    idxe <- idxe-1

    # Turn values into a data frame
    slm <- strsplit(contents[idxs:idxe]," ")
    tmp <- data.frame()
    for (slt in slm) {
      jdx <- which(nchar(slt) != 0)
      #if (length(jdx) != 2) stop("Wrongly formatted BLEND_KEYWORDS.dat file")
      tmp <- rbind(tmp,data.frame(I(slt[jdx[1]]),I(slt[jdx[2]])))   # "I()" to avoid the data frame turning characters into factors
    }
    for (i in 1:length(tmp[,1])) {
      if (as.character(tmp[i,1]) == "NBIN" | as.character(tmp[i,1]) == "nbin") nbin <- as.numeric(tmp[i,2])
      if (as.character(tmp[i,1]) == "RADFRAC" | as.character(tmp[i,1]) == "radfrac" |
          as.character(tmp[i,1]) == "RADF" | as.character(tmp[i,1]) == "radf") fdrop <- as.numeric(tmp[i,2])
      if (as.character(tmp[i,1]) == "ISIGI" | as.character(tmp[i,1]) == "isigi" |
          as.character(tmp[i,1]) == "ISIG" | as.character(tmp[i,1]) == "isig") isigi <- as.numeric(tmp[i,2])
      if (as.character(tmp[i,1]) == "CPARWT" | as.character(tmp[i,1]) == "cparwt" |
          as.character(tmp[i,1]) == "CPAR" | as.character(tmp[i,1]) == "cpar") cparweight <- as.numeric(tmp[i,2])
    }

    # Turn indices back to their initial value for following section
    idxs <- idxs-1
    idxe <- idxe+1
  }

  # Not all keywords may be present in an existing "BLEND_KEYWORDS.dat": complete it as needed.
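  # For illustration, a minimal BLEND_KEYWORDS.dat holding the default values
  # (this is the format written out below):
  #
  #   BLEND KEYWORDS
  #   NBIN 20
  #   RADFRAC 0.750
  #   ISIGI 1.500
  #   CPARWT 1.000
  #   POINTLESS KEYWORDS
  #   AIMLESS KEYWORDS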
cat("BLEND KEYWORDS\n",file="BLEND_KEYWORDS.dat") if ((idxe-idxs) > 1) { idxs <- idxs+1 idxe <- idxe-1 for (i in idxs:idxe) cat(paste(contents[i],"\n",sep=""),file="BLEND_KEYWORDS.dat",append=TRUE) nomi <- as.character(tmp[,1]) if (!("NBIN" %in% nomi) & !("nbin" %in% nomi)) { linea <- sprintf("NBIN %d\n",as.integer(nbin)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) } if (!("RADFRAC" %in% nomi) & !("radfrac" %in% nomi) & !("RADF" %in% nomi) & !("radf" %in% nomi)) { linea <- sprintf("RADFRAC %5.3f\n",as.numeric(fdrop)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) } if (!("ISIGI" %in% nomi) & !("isigi" %in% nomi) & !("ISIG" %in% nomi) & !("isig" %in% nomi)) { linea <- sprintf("ISIGI %5.3f\n",as.numeric(isigi)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) } if (!("CPARWT" %in% nomi) & !("cparwt" %in% nomi) & !("CPAR" %in% nomi) & !("cpar" %in% nomi)) { linea <- sprintf("CPARWT %5.3f\n",as.numeric(cparweight)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) } # Turn indices back to their initial value for following section idxs <- idxs-1 idxe <- idxe+1 } # If nothing was included in BLEND KEYWORDS section add everything if ((idxe-idxs) == 1) { linea <- sprintf("NBIN %d\n",as.integer(nbin)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) linea <- sprintf("RADFRAC %5.3f\n",as.numeric(fdrop)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) linea <- sprintf("ISIGI %5.3f\n",as.numeric(isigi)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) linea <- sprintf("CPARWT %5.3f\n",as.numeric(cparweight)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) } # Append rest of keywords for (i in idxe:(length(contents)-1)) cat(paste(contents[i],"\n",sep=""),file="BLEND_KEYWORDS.dat",append=TRUE) cat(contents[length(contents)],file="BLEND_KEYWORDS.dat",append=TRUE) } # If keyword file is not found write one with default words if (!file.exists("BLEND_KEYWORDS.dat")) { cat("BLEND KEYWORDS\n",file="BLEND_KEYWORDS.dat") linea <- sprintf("NBIN %d\n",as.integer(nbin)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) linea <- sprintf("RADFRAC %5.3f\n",as.numeric(fdrop)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) linea <- sprintf("ISIGI %5.3f\n",as.numeric(isigi)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) linea <- sprintf("CPARWT %5.3f\n",as.numeric(cparweight)) cat(linea,file="BLEND_KEYWORDS.dat",append=TRUE) # Complete with keywords sections for POINTLESS and AIMLESS cat("POINTLESS KEYWORDS\n",file="BLEND_KEYWORDS.dat",append=TRUE) cat("AIMLESS KEYWORDS\n",file="BLEND_KEYWORDS.dat",append=TRUE) } ####################################################################################################### ####################################################################################################### ################################### RADIATION DAMAGE ANALYSIS ######################################### ####################################################################################################### ####################################################################################################### # Load input file names data <- read.table(file="forR_raddam.dat") nfiles <- length(data[,1]) filenames <- as.character(data$V1) # Use ordering of file NEW_list_of_files.dat, rather than order from forR_raddam.dat filenames <- reArrangeFilenames(filenames) # Loop to get rid of radiation damaged observations # and to compute overall, relative B factor sMax <- c() sMin <- c() listData <- list() batches <- c() Ibatches <- c() Fbatches <- c() cat("Radiation damage analysis ...\n") tempWplots 
for (crystal in filenames) {
  cat(sprintf("Dealing with file %s.\n",crystal))
  cat(sprintf("Considering purging data of radiation-damaged elements."))
  data <- read.table(crystal)
  data <- cleanData(data)

  # Check if resolution and number of images are within limits for the radiation damage procedure
  ansR <- okRadDam(data)
  if (abs(fdrop) < 0.000001) ansR <- FALSE
  originalBatch <- max(data$V4,na.rm=TRUE)
  Ibatches <- c(Ibatches,range(data$V4,na.rm=TRUE)[1])
  Fbatches <- c(Fbatches,range(data$V4,na.rm=TRUE)[2])
  meanBC <- mean(data$V4,na.rm=TRUE)

  # Procedure for radiation damage detection only activated if resolution and number of images are within limits
  if (ansR) {
    wplots <- Wplots(data,nbin=nbin,m=0.125)
    tempWplots <- c(tempWplots,list(wplots))
    coefs <- newraddam(wplots)
  }
  if (!ansR) coefs <- c(1/0,1/0)

  # Purging is done only when criteria are met
  if (is.finite(coefs[2])) {
    cat(sprintf(" This dataset will be purged ..."))

    # Calculate image cutoff for each resolution
    tmp <- sapply(data$V9,tfit,coefs=coefs,fdrop=fdrop)+Ibatches[length(Ibatches)]
    tmp2 <- range(data$V4,na.rm=TRUE)
    tmp[tmp > tmp2[2]] <- NA
    tmp[tmp < 1] <- NA

    # Now purge data of radiation-damaged observations
    cond <- data$V4 > tmp
    idx <- which(cond)
    if (length(idx) > 0) data <- data[-idx,]
    meanAC <- mean(data$V4,na.rm=TRUE)
    cat(sprintf(" done.\n"))
  }
  if (!is.finite(coefs[2])) {
    if (ansR) cat(sprintf(" This dataset will not be purged.\n"))
    if (!ansR) {
      if (abs(fdrop) < 0.000001) cat(sprintf(" This dataset will not be purged.\n"))
      if (abs(fdrop) >= 0.000001) cat(sprintf(" This dataset will not be purged because there are too few images or the resolution is too low.\n"))
    }
    meanAC <- meanBC
  }

  # Store purged datasets
  listData <- c(listData,list(data))

  # Store resolution ranges for later
  sMax <- c(sMax,range(data$V9,na.rm=TRUE)[2])
  sMin <- c(sMin,range(data$V9,na.rm=TRUE)[1])

  # Number of last batch, after cut (approximate method at present)
  batches <- c(batches,floor(originalBatch-meanBC+meanAC))
}

#######################################################################################################
#######################################################################################################
###################################### DESCRIPTORS SECTION ############################################
######################################      B FACTORS      ############################################
#######################################################################################################
#######################################################################################################
# Find common resolution range for all datasets
m <- max(sMin)
M <- min(sMax)

# Compute overall Wilson plots for all datasets
listBplots <- c()
listWplots <- c()
listSDWplots <- c()
for (data in listData) {
  tmp <- aveIvsRes(nbin,data$V9^2,data$V7,m^2,M^2)
  listBplots <- c(listBplots,list(tmp))
  tmp <- aveIvsRes(nbin,data$V9,data$V7)
  listWplots <- c(listWplots,list(tmp))
  tmp <- aveIvsRes(nbin,data$V9,data$V8)
  listSDWplots <- c(listSDWplots,list(tmp))
}

# Get rid of listData to avoid dumping redundant information in the R image
rm(listData)

# Choose reference dataset
tmp <- c()
for (wplots in listBplots) tmp <- c(tmp,sum(wplots[[1]],na.rm=TRUE))
idxref_list <- tmp
idxref <- which(tmp == max(tmp))
idxref <- idxref[1]
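# (The reference is the dataset with the largest total number of observations
# in the common resolution range, i.e. the largest sum of bin counts.)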
#######################################################################################################
#######################################################################################################
########################################## RESOLUTION CUTS ############################################
#######################################################################################################
#######################################################################################################
# Signal-to-noise analysis to find the optimal resolution cut for each dataset
resoCuts <- c()
for (i in 1:length(listWplots)) {
  tmp <- bestReso(listWplots[[i]][[2]],listWplots[[i]][[3]]/listSDWplots[[i]][[3]],snr=isigi)
  resoCuts <- c(resoCuts,tmp[2])
  if (tmp[1]) {
    nome <- filenames[i]
    tmp2 <- strsplit(nome,"_")
    tmp2 <- strsplit(tmp2[[1]][3],".",fixed=TRUE)
    idx <- as.integer(tmp2[[1]][1])
    messaggio <- sprintf("WARNING! Intensities of dataset %d are very weak. You might need to adjust the ISIGI parameter.\n",idx)
    cat(messaggio)
  }
}

# Get rid of listWplots and listSDWplots to avoid dumping redundant information in the R image
rm(listWplots)
rm(listSDWplots)

#######################################################################################################
#######################################################################################################
###################################### DESCRIPTORS SECTION ############################################
######################################   CELL PARAMETERS   ############################################
#######################################################################################################
#######################################################################################################
# Load data from file "forR_macropar.dat"
macropar <- read.table("forR_macropar.dat")
names(macropar) <- c("cn","a","b","c","alpha","beta","gamma","mosa","ctoddist","wlength")

# Re-arrange macropar with same order as file NEW_list_of_files.dat
macropar <- macropar[order(macropar$cn),]

# Prepare data frame for cluster analysis
cat("Preparing data for cluster analysis ...\n")
maindf <- cbind(macropar[,1:7],data.frame(batch=batches,Ibatch=Ibatches,Fbatch=Fbatches))

# Decide which macroparameters can be used in cluster analysis: a cell parameter
# (columns 2-7 = a,b,c,alpha,beta,gamma) is kept only if it has no NA's and is
# not numerically constant across datasets
cell_columns <- c()
for (j in 2:7) {
  if (sum(is.na(maindf[,j])) == 0) {
    tmp <- sd(maindf[,j])
    if (tmp > 0.000001) cell_columns <- c(cell_columns,j)
  }
}

# Full list of descriptors used
fullc <- cell_columns

# Name lowresos and highresos arrays with same number as original dataset
# (lowresos = low-resolution limits in angstroms, highresos = the cuts from bestReso)
lowresos <- 1/sMin
highresos <- resoCuts
names(lowresos) <- maindf$cn
names(highresos) <- maindf$cn

# New set of descriptors that better exploit information in Wilson plots
nparW <- nparWilson(listBplots,maindf$cn,nref=idxref,cumprob=0.95)

# Get rid of listBplots to avoid dumping redundant information in the R image
rm(listBplots)
#######################################################################################################
#######################################################################################################
########################################## CLUSTER ANALYSIS ###########################################
#######################################################################################################
#######################################################################################################
# Enough information to start cluster analysis (add bit concerning cluster analysis here)
cat("Cluster analysis initiated ...\n")

# Carry on cluster analysis with descriptors from Wilson plots
nparW.dist <- dist(nparW)

# Reduce data dimensionality and standardize
if (length(fullc) != 0) {
  nparC <- nparCell(maindf[,fullc],maindf$cn,cumprob=0.95)

  # Build distance matrix for cluster analysis (2 methods)
  nparC.dist <- dist(nparC)
}

# Final distance matrix has contributions from both sets of descriptors
if (length(fullc) == 0) distAll <- nparW.dist
if (length(fullc) != 0) distAll <- cparweight*nparC.dist+(1-cparweight)*nparW.dist

# Cluster analysis ("ward" was renamed "ward.D" in R 3.1.0; getRversion() makes
# the comparison robust, whereas testing major >= 3 & minor >= 1 fails for e.g. R 4.0)
if (getRversion() >= "3.1.0") {
  npar.hc_ward <- hclust(distAll,method="ward.D")
} else {
  npar.hc_ward <- hclust(distAll,method="ward")
}

# Dendrogram and resolution information are stored in list groups
groups <- treeToList(npar.hc_ward,lowresos,highresos)

# Calculate LCV values for all nodes of the dendrogram
LCV_values <- c()
aLCV_values <- c()
LCV_couples <- matrix(nrow=length(groups[[1]]),ncol=2)
icpls <- 0
for (cn in groups[[1]]) {
  icpls <- icpls+1
  idx <- match(cn,macropar$cn)
  #tmp <- maxRatio(macropar[idx,2:7])
  tmp <- maxRatio(macropar,idx)
  LCV_values <- c(LCV_values,tmp[1])
  aLCV_values <- c(aLCV_values,tmp[2])
  minV <- min(tmp[3],tmp[4])
  maxV <- max(tmp[3],tmp[4])
  LCV_couples[icpls,1] <- as.integer(minV)
  LCV_couples[icpls,2] <- as.integer(maxV)
}

# Output merging-nodes table to an ascii file
nTable <- "./CLUSTERS.txt"
if (file.exists(nTable)) emptyc <- file.remove(nTable)
linea <- " \n"
cat(linea,file=nTable)
linea <- " Cluster  Number of  Cluster     LCV      aLCV    Furthest    Datasets\n"
cat(linea,file=nTable,append=TRUE)
linea <- " Number   Datasets   Height                       Datasets       ID\n"
cat(linea,file=nTable,append=TRUE)
linea <- " \n"
cat(linea,file=nTable,append=TRUE)
for (i in 1:length(groups[[1]])) {
  sorted_groups <- sort(groups[[1]][[i]])
  linea <- paste(sprintf(" %03d %3d %7.3f %7.2f %9.2f %5d %4d ",
                         i,length(groups[[1]][[i]]),npar.hc_ward$height[i],
                         LCV_values[i],aLCV_values[i],LCV_couples[i,1],LCV_couples[i,2]),
                 " ",paste(sorted_groups,collapse=" "),"\n",sep="")
  cat(linea,file=nTable,append=TRUE)
}
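# For illustration, a CLUSTERS.txt row with hypothetical values:
#    001   2   0.123    0.85      1.20     3    7  3 7
# i.e. cluster 001 merges 2 datasets (3 and 7) at Ward height 0.123, with an
# LCV of 0.85% and an absolute LCV of 1.20 A; 3 and 7 are the furthest pair.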
# Print dendrogram with LCV values annotated (in PNG and POSTSCRIPT formats)
VV <- c()
for (i in 1:length(maindf[,1])) {
  tmp <- .dcl(maindf[i,2],maindf[i,3],maindf[i,4],maindf[i,5],maindf[i,6],maindf[i,7])[16]
  attributes(tmp) <- NULL
  VV <- c(VV,tmp)
}
kk <- LCV_values[length(groups[[1]])]
akk <- aLCV_values[length(groups[[1]])]
tmsg <- sprintf("LCV: %.2f%s (absolute LCV: %.2f",kk,"%",akk)
msg <- bquote(.(tmsg) ~ ring(A)*")")
if (length(npar.hc_ward$height) > 1) {
  png(file="./tree.png",height=1000,width=1000)
  #plclust(npar.hc_ward,xlab="Individual datasets",ylab="Ward distance",main=msg,sub="")
  par(oma=c(0,0.5,0,0.5),mgp=c(1.5,0.5,0))
  plot(npar.hc_ward,xlab="Individual datasets",ylab="Ward distance",main=msg,sub="",
       col.main="red",cex.main=2,col.lab="blue",cex.lab=2)
  nodesxy <- find_nodes_coords(npar.hc_ward,groups[[1]],macropar$cn)
  if (length(LCV_values) > 5) idx <- (length(LCV_values)-4):(length(LCV_values)-1)
  if (length(LCV_values) <= 5) idx <- 1:(length(LCV_values)-1)
  labelsxy <- c()
  for (i in 1:length(LCV_values)) {
    stmp <- sprintf("%6.2f(%.2f)",LCV_values[i],aLCV_values[i])
    labelsxy <- c(labelsxy,stmp)
  }
  text(nodesxy$x[idx],nodesxy$y[idx],labels=labelsxy[idx],adj=c(0,-0.5),col="red",cex=1.0)
  emptyc <- dev.off()   # The "emptyc" is to collect the return value of dev.off(), so that it's not output
  #postscript(file="./tree.ps", height = 10, width = 10, paper = "a4")
  postscript(file="./tree.ps",paper="a4",horizontal=FALSE)
  #plclust(npar.hc_ward,xlab="Individual datasets",ylab="Ward distance",main=msg,sub="")
  par(oma=c(0,0.5,0,0.5),mgp=c(1.5,0.5,0))
  plot(npar.hc_ward,xlab="Individual datasets",ylab="Ward distance",main=msg,sub="",
       col.main="red",cex.main=2,col.lab="blue",cex.lab=2)
  text(nodesxy$x[idx],nodesxy$y[idx],labels=labelsxy[idx],adj=c(0,-0.2),col="red",cex=0.8)
  emptyc <- dev.off()   # The "emptyc" is to collect the return value of dev.off(), so that it's not output
}
if (length(npar.hc_ward$height) == 1) cat("WARNING! No plot of the dendrogram is available as there is only 1 node. For the cluster height refer to file 'CLUSTERS.txt'.\n")
cat("Cluster analysis completed!\n")

# Remove "refs_*_*.dat" files and the "forR_raddam.dat" file
for (crystal in filenames) {
  if (file.exists(crystal)) emptyc <- file.remove(crystal)   # The "emptyc" is to collect the return value of file.remove(), so that it's not output
}
if (file.exists("forR_raddam.dat")) emptyc <- file.remove("forR_raddam.dat")
if (file.exists("forR_macropar.dat")) emptyc <- file.remove("forR_macropar.dat")

# Write file with batch information and original crystal number
filenames <- read.table("./NEW_list_of_files.dat",as.is=c(1))
emptyc <- file.remove("./NEW_list_of_files.dat")
for (ii in 1:length(filenames[,1])) {
  stmp <- normalizePath(filenames$V1[ii])
  filenames$V1[ii] <- stmp
}
if (file.exists("FINAL_list_of_files.dat")) emptyc <- file.remove("FINAL_list_of_files.dat")
idx <- match(1:length(filenames[,1]),maindf$cn)
for (i in 1:length(filenames[,1])) {
  if (!is.na(idx[i])) {
    linea <- sprintf("%-130s %5.0f %5.0f %5.0f %5.0f %6.3f\n",filenames[i,],maindf$cn[idx[i]],
                     maindf$batch[idx[i]],maindf$Ibatch[idx[i]],maindf$Fbatch[idx[i]],highresos[idx[i]])
  }
  if (is.na(idx[i])) {
    linea <- sprintf("%-130s %5.0f %5.0f %5.0f %5.0f %6.3f\n",filenames[i,],NA,NA,NA,NA,NA)
  }
  cat(linea,file="./FINAL_list_of_files.dat",append=TRUE)
}

# Output message for user
#cat(" \n")
#cat("##################################################################\n")
#cat("##################################################################\n")
#cat("##################################################################\n")
#cat(" The following files have been produced:\n")
#cat("mtz_names.dat           : ascii file containing the path of input files used for the analysis;\n")
#cat("BLEND_SUMMARY.txt       : ascii file containing a summary table with features of the datasets analysed;\n")
#cat("FINAL_list_of_files.dat : ascii file with the path, serial number and number of batches used for each dataset;\n")
#cat("tree.png                : PNG format picture of the clustering dendrogram;\n")
#cat("CLUSTERS.txt            : ascii file providing details of the clustering.\n")
#cat("\n =========>>> Please analyse the dendrogram and run BLEND in synthesis mode to produce merged datasets.\n")
#cat(" \n")

# Save image for next run of BLEND
save.image(file="BLEND.RData")

# Exit without saving
q(save="no",status=0,runLast=FALSE)