function [beta,li,alli]=Mregsvd(x,y,ivalid,ngene,nfac,nmc,cvi) %===================================================================== % % Fits full posterior in the linear regression model with Bayesian stochastic regularisation % using iterative similation MCMC methods % % % INPUTS: % x is the expression data to be used -- genes are rows, arrays are columns % y is response % ivalid is a vector of array numbers of validation cases % ngene is the number of genes to be selected -- those most highly correlated with % outcome on the training arrays % nfac is the number of supergene factors to include, the rest are ignored % nmc is a 3 vector: % nit is the number of iterates to summarise the MCMC % nbi is the initial number to burn-in and discard % nskip is the number of skips between saves % cvi is 1 if you want '1 at a time' cross-validation analyses too, 0 if not % % OUTPUTS: % beta is the estimated regression vector for genes % li the top ngene genes selected in the training set analysis % alli the indices of all genes selected in the CV optimisations % % data & parameters [N,n]=size(x); nvalid=length(ivalid); itrain=1:n; itrain(ivalid)=[]; ntrain=length(itrain); k=2; nit=nmc(1); nbi=nmc(2); nskip=nmc(3); % reduce to top most correlated ngene ... Y=y; Y(ivalid)=[]; X=x; X(:,ivalid)=[]; ind=select_genes_cor(X,Y,ngene); li=ind(1:ngene); alli=[]; 'running optimised training set analysis ' X=x(li,:)-repmat(mean(x(li,:),2),1,n); [A,D,F]=svd_mw(X); D=D'; [p,n]=size(F); j=1:min(min(ngene,nfac),p); A=A(:,j); D=D(j); F=F(j,:); [p,n]=size(F); ln=1/sqrt(n); F=([ln*ones(1,n);F]); D=[sqrt(n);D]; D2=D.*D; p=p+1; gamma=zeros(p,1); tau2=ones(p,1); sigma2=var(y); sigma=sqrt(sigma2); vi=tau2./(sigma2+tau2); my=F'*gamma; m=[]; fit=[]; data=reshape(y,n,1); for j=1:(nbi+nskip*nit) if (nvalid>0), data(ivalid)=my(ivalid)+sigma*randn(nvalid,1); end gamma=vi.*(F*data)+sqrt(vi.*sigma).*randn(p,1); my=F'*gamma; e=data-my; sigma2=1/gamrnd(n/2,2/(e'*e)); sigma=sqrt(sigma2); tau2=1./gamrnd((k+1)/2,2./(k+gamma.^2),p,1); vi=tau2./(sigma2+tau2); if (j>nbi & floor((j-nbi)/nskip)==(j-nbi)/nskip) fit=[fit,my]; m=[m,gamma]; end; if (floor(j/1000)==j/1000) j end; end; gamma=mean(m,2); gamma=gamma(2:p); beta=A*gamma; F=F(2:p,:); p=p-1; figure(2); set(gca,'FontSize',14); subplot(2,1,1); plot(gamma,'--bs','MarkerSize',6,'LineWidth',1.0); hold on; title('Expression weights: Metagenes'); xlabel('Metagene'); ylabel('gamma') plot([0 p],[0 0],'black--','LineWidth',1); hold off subplot(2,1,2); plot(beta,'LineWidth',1.0); hold on; plot([0 ngene],[0 0],'black--','LineWidth',1.0); hold off title('Expression weights: Genes') xlabel('Gene'); ylabel('beta') hold off; figure(4); clf; set(gca,'FontSize',14); mfit=mean(fit')'; sl=prctile(fit',5)'; su=prctile(fit',95)'; scatter(mfit,y,'k'); hold on scatter(mfit(ivalid),y(ivalid),'r'); hold on % a= [ min([min(mfit),min(y)]), max([max(mfit),max(y)]) ]; line(a,a); hold on scatter(mfit,mfit,'b*'); hold on; for j=itrain plot([mfit(j) mfit(j)],[sl(j),su(j)],'b:','LineWidth',2.0) end; for j=ivalid plot([mfit(j) mfit(j)],[sl(j),su(j)],'r:','LineWidth',2.0) end; ylabel('Outcome'); xlabel('Predicted values'); hold off; li=reshape(li,ngene,1); if (cvi==1) %%% training cases ... [N,n]=size(x); fitcv=fit; for ij=1:ntrain ix=itrain(ij); ['running analysis ',int2str(ix) ] % reduce to most correlated ngene ignoring the hold out and validation cases iv=sort(unique([ix ivalid])); Y=y; Y(iv)=[]; X=x; X(:,iv)=[]; s=[]; ind=select_genes_cor(X,Y,ngene); si=ind(1:ngene); alli=unique([alli si]); X=x(si,:)-repmat(mean(x(si,:),2),1,n); [A,D,F]=svd_mw(X); D=D'; [p,n]=size(F); j=1:min(min(ngene,nfac),p); A=A(:,j); D=D(j); F=F(j,:); [p,n]=size(F); F=([ln*ones(1,n);F]); D=[sqrt(n);D]; D2=D.*D; p=p+1; gamma=zeros(p,1); tau2=ones(p,1); sigma2=var(y); sigma=sqrt(sigma2); vi=tau2./(sigma2+tau2); my=F'*gamma; fit=[]; data=reshape(y,n,1); for j=1:(nbi+nskip*nit) data(iv)=my(iv)+sigma*randn(nvalid+1,1); gamma=vi.*(F*data)+sqrt(vi.*sigma).*randn(p,1); my=F'*gamma; tau2=1./gamrnd((k+1)/2,2./(k+gamma.^2),p,1); vi=tau2./(sigma2+tau2); if (j>nbi & floor((j-nbi)/nskip)==(j-nbi)/nskip) fit=[fit, my(ix)]; end; if (floor(j/1000)==j/1000) j end; end; fitcv(ix,:)=fit; end; figure(5); clf; set(gca,'FontSize',14); mfit=mean(fitcv')'; sl=prctile(fitcv',5)'; su=prctile(fitcv',95)'; scatter(mfit(itrain),y(itrain),'k'); hold on % a=[min([y,mfit']), max([y,mfit'])]; line(a,a); hold on scatter(mfit(itrain),mfit(itrain),'b*'); hold on; for j=itrain plot([mfit(j) mfit(j)],[sl(j),su(j)],'b:','LineWidth',2.0) end; title('CV predicted outcomes') ylabel('Outcome'); xlabel('Predicted values'); hold off; % end; alli=reshape(alli,length(alli),1);