00001
00002 #include <stdio.h>
00003 #include <string.h>
00004 #include <ctype.h>
00005 #include <stdlib.h>
00006 #include <math.h>
00007
00008 #define MAXNAMES 50
00009 #define MAXSEQ 50
00010 #define MAXLEN 1000
00011 #define MAXLINE 2048
00012
00013 #define Boolean short
00014 #define TRUE 1
00015 #define FALSE 0
00016 #define EOS '\0'
00017
00018 #define INT_SCALE_FACTOR 100
00019 #define NUMRES 32
00020
00021 #define LEFT 1
00022 #define RIGHT 2
00023
00024 #define NODE 0
00025 #define LEAF 1
00026
00027
00028 typedef struct node {
00029 struct node *left;
00030 struct node *right;
00031 struct node *parent;
00032 float dist;
00033 int leaf;
00034 int order;
00035 char name[64];
00036 } stree, *treeptr;
00037
00038
00039 void guide_tree(FILE *tree,int nseqs);
00040
00041 int SeqGCGCheckSum(char *seq, int len);
00042
00043
00044 void make_colscores(void);
00045 Boolean keyword(char *line,char *code);
00046 int getargs(char *line,char *args[],int max);
00047 int parse_repart(char *str);
00048 Boolean blankline(char *line);
00049 void readmsf(FILE *fin);
00050 void readref(FILE *fin);
00051 void score_ref(void);
00052 int countmsf(FILE *fin);
00053 void columnscore(Boolean verbose);
00054 void readref(FILE *fin);
00055 void read_annotation(FILE *fin);
00056 int countref(FILE *ifd);
00057 void code_refseq(int i);
00058 void code_seq(int i);
00059 void aln_seq(int i);
00060 void ref_gaps(int cutoff);
00061
00062
00063 char refnames[MAXSEQ][MAXNAMES+1];
00064 char refseq_array[MAXSEQ][MAXLEN];
00065 int refseq_code[MAXSEQ][MAXLEN];
00066 int refseq_col[MAXLEN];
00067 int refseqlength;
00068 int refnseqs;
00069
00070
00071 char names[MAXSEQ][MAXNAMES+1];
00072 char seq_array[MAXSEQ][MAXLEN];
00073 int seq_code[MAXSEQ][MAXLEN];
00074 int seq_aln[MAXSEQ][MAXLEN];
00075 int seqlength;
00076 int seq_xref[MAXSEQ];
00077 int nseqs;
00078
00079 Boolean verbose;
00080
00081 int refscore;
00082
00083 int gap[MAXLEN];
00084
00085
00086 int cutoff;
00087
00088
00089
00090 int nblocks;
00091 typedef struct Block {
00092 int s;
00093 int e;
00094 int nseqs;
00095 } Block;
00096
00097 Block blocks[100];
00098
00099
00100
00101 int tc_score;
00102 float maxpc_res;
00103 float pc_res;
00104
00105 int main(int argc, char **argv)
00106 {
00107 FILE *ifd,*tfd,*afd;
00108 int err,i,j,ires,iseq=0;
00109 int ix;
00110 char t[MAXLINE+1];
00111 char seq[MAXLINE+1];
00112 char clen[MAXLINE+1];
00113 char cvar[MAXLINE+1];
00114 char cprog[MAXLINE+1];
00115 char method;
00116 Boolean eof,found;
00117
00118 if(argc<3) {
00119 fprintf(stderr,"Usage: %s ref_aln test_aln [annot_file] [-v]\n",argv[0]);
00120 fprintf(stderr," where ref_aln reference alignment in msf format \n");
00121 fprintf(stderr," test_aln test alignment in msf format \n");
00122 fprintf(stderr," annot_file optional BAliBASE annotation file\n");
00123 fprintf(stderr," -v verbose mode\n");
00124 return 1;
00125 }
00126
00127
00128 if((ifd=fopen(argv[1],"r"))==NULL) {
00129 fprintf(stderr,"Cannot open reference aln file [%s]",argv[1]);
00130 return 1;
00131 }
00132
00133
00134 if((tfd=fopen(argv[2],"r"))==NULL) {
00135 fprintf(stderr,"Cannot open test aln file [%s]",argv[2]);
00136 return 1;
00137 }
00138
00139
00140
00141
00142
00143
00144 verbose=FALSE;
00145 method='C';
00146 if(argc>=4)
00147 {
00148 if(argv[3][0]=='-')
00149 verbose=TRUE;
00150 else
00151 {
00152 if((afd=fopen(argv[3],"r"))==NULL) {
00153 fprintf(stderr,"Cannot open annotation file [%s]",argv[3]);
00154 return 1;
00155 }
00156 method='B';
00157 if(argc>=5) verbose=TRUE;
00158 }
00159 }
00160
00161
00162 refnseqs=countmsf(ifd);
00163 if(refnseqs==0) {
00164 fprintf(stderr,"Error: no sequences in %s\n",argv[1]);
00165 return 1;
00166 }
00167 readref(ifd);
00168
00169
00170 nseqs = countmsf(tfd);
00171 if(nseqs==0) {
00172 fprintf(stderr,"Error: no sequences in %s\n",argv[2]);
00173 return 1;
00174 }
00175 readmsf(tfd);
00176
00177 if(nseqs != refnseqs) {
00178 fprintf(stderr,"Error: %d sequences in %s and %d in %s",refnseqs,argv[1],nseqs,argv[2]);
00179 return 1;
00180 }
00181
00182
00183
00184
00185 if(method=='C')
00186 {
00187 cutoff=(float)refnseqs*20.0/100.0;
00188 if(cutoff<1) cutoff=1;
00189 ref_gaps(cutoff);
00190 }
00191
00192
00193
00194 for(i=0;i<refnseqs;i++)
00195 seq_xref[i]=-1;
00196 for(i=0;i<refnseqs;i++)
00197 {
00198 found=FALSE;
00199 for(j=0;j<nseqs;j++)
00200 {
00201 if(strcasecmp(names[j],refnames[i])==0)
00202 {
00203 found=TRUE;
00204 seq_xref[j]=i;
00205 break;
00206 }
00207 }
00208 if(found==FALSE) {
00209 fprintf(stderr,"Error: sequence %s not found in test aln %s\n",refnames[i],argv[2]);
00210 return 1;
00211 }
00212 }
00213
00214 fprintf(stdout,"\nComparing test alignment in %s\nwith reference alignment in %s\n",argv[2],argv[1]);
00215 if(method=='B')
00216 fprintf(stdout,"\nUsing core blocks defined in %s\n",argv[3]);
00217
00218
00219
00220 for(i=0;i<refnseqs;i++)
00221 code_refseq(i);
00222
00223
00224 if(method=='B')
00225 {
00226 read_annotation(afd);
00227 }
00228
00229
00230 score_ref();
00231
00232
00233
00234 for(i=0;i<nseqs;i++)
00235 if ( seq_xref[i] != -1) code_seq(i);
00236
00237
00238 columnscore(verbose);
00239 pc_res/=maxpc_res;
00240
00241 fprintf(stdout,"\n\tSP score= %.3f\n",pc_res);
00242 fprintf(stdout,"\n\tTC score= %.3f\n",(float)tc_score/100.0);
00243
00244
00245 }
00246
00247
00248
00249
00250 void code_refseq(int seq)
00251 {
00252 int i,j;
00253
00254
00255
00256 for(i=0;i<refseqlength;i++)
00257 {
00258 if(refseq_array[seq][i]=='-')
00259 {
00260 refseq_code[seq][i]=0;
00261 }
00262 else
00263 {
00264 refseq_code[seq][i]=i+1;
00265 }
00266 }
00267 }
00268
00269
00270
00271
00272 void ref_gaps(int cutoff)
00273 {
00274 int i,j;
00275
00276
00277
00278
00279 for(i=0;i<refseqlength;i++)
00280 {
00281 gap[i]=0;
00282 for(j=0;j<refnseqs;j++)
00283 if(refseq_array[j][i]=='-')
00284 {
00285 gap[i]++;
00286 }
00287 }
00288
00289
00290 for(i=0;i<refseqlength;i++)
00291 {
00292 if(gap[i]>=cutoff) refseq_col[i]=0;
00293 else refseq_col[i]=refnseqs-gap[i];
00294 }
00295 }
00296
00297 void score_ref(void)
00298 {
00299 int i,j;
00300
00301
00302 maxpc_res=0;
00303 for(i=0;i<refseqlength;i++)
00304 {
00305 if(refseq_col[i]>1)
00306 {
00307 maxpc_res+=refseq_col[i]*(refseq_col[i]-1)/2.0;
00308 }
00309 }
00310 }
00311
00312 void code_seq(int seq)
00313 {
00314 int i,j,ix;
00315
00316
00317 ix=0;
00318 for(j=0;j<refseqlength;j++)
00319 if(refseq_array[seq_xref[seq]][j]!='-')
00320 {
00321 ix=refseq_code[seq_xref[seq]][j];
00322 break;
00323 }
00324
00325 for(i=0;i<seqlength;i++)
00326 {
00327 if(seq_array[seq][i]=='-')
00328 {
00329 seq_code[seq_xref[seq]][i]=0;
00330 }
00331 else
00332 {
00333
00334
00335 if (refseq_col[ix-1]>0 && refseq_array[seq_xref[seq]][ix-1]!='-') seq_code[seq_xref[seq]][i]=ix;
00336 for(j+=1;j<refseqlength;j++)
00337 if(refseq_array[seq_xref[seq]][j]!='-')
00338 {
00339 ix=refseq_code[seq_xref[seq]][j];
00340 break;
00341 }
00342 }
00343 }
00344 }
00345
00346 void columnscore(Boolean verbose)
00347 {
00348 int i,j,k;
00349 int iseq,ncols;
00350 int scores[MAXLEN][MAXSEQ];
00351 int index[MAXSEQ];
00352 int n;
00353 int colscore1[MAXLEN];
00354 int pc;
00355 int nblocks;
00356 int blen=50;
00357 Boolean found;
00358
00359 tc_score=ncols=0;
00360 pc_res=0;
00361 for(j=0;j<seqlength;j++)
00362 colscore1[j]=0;
00363
00364
00365 for(i=0;i<seqlength;i++)
00366 {
00367 for(j=0;j<nseqs;j++)
00368 scores[i][j]=0;
00369 n=0;
00370 for(j=0;j<nseqs;j++)
00371 {
00372 if(seq_code[j][i]!=0)
00373 {
00374 found=FALSE;
00375 for(k=0;k<n;k++)
00376 {
00377 if(seq_code[j][i]==index[k])
00378 {
00379 scores[i][k]++;
00380 found=TRUE;
00381 break;
00382 }
00383 }
00384 if(found==FALSE)
00385 {
00386 scores[i][n]=1;
00387 index[n]=seq_code[j][i];
00388 n++;
00389 }
00390 }
00391 }
00392 for(j=0;j<nseqs;j++)
00393 {
00394 if(scores[i][j]>1) {
00395 if(scores[i][j]>refseq_col[i]) scores[i][j]=refseq_col[i];
00396 pc_res+=scores[i][j]*(scores[i][j]-1)/2.0;
00397 }
00398 }
00399
00400 if(seq_code[0][i]>0 && scores[i][0]>=refseq_col[seq_code[0][i]-1])
00401 colscore1[i]=1;
00402 if (seq_code[0][i]!=0) ncols++;
00403
00404 tc_score+=colscore1[i];
00405 }
00406
00407 if(verbose)
00408 {
00409 nblocks=seqlength/blen;
00410 fprintf(stdout,"\n\n");
00411 for(k=0;k<=nblocks;k++) {
00412 for(i=0;i<nseqs;i++) {
00413 fprintf(stdout,"%20s ",names[i]);
00414 for(j=0;j<blen && k*blen+j<seqlength;j++) {
00415 fprintf(stdout,"%c",seq_array[i][k*blen+j]);
00416 }
00417 fprintf(stdout,"\n");
00418 }
00419 fprintf(stdout,"\n");
00420 fprintf(stdout,"\n ");
00421 for(j=0;j<blen && k*blen+j<seqlength;j++) {
00422 if(seq_code[0][k*blen+j]!=0)
00423 fprintf(stdout,"%d",colscore1[k*blen+j]);
00424
00425 else
00426 fprintf(stdout,".");
00427 }
00428 fprintf(stdout,"\n\n");
00429 }
00430
00431 }
00432
00433
00434 tc_score=100*(float)tc_score/(float)ncols;
00435 }
00436
00437
00438 void code_blockseq(int seq)
00439 {
00440 int i,j;
00441
00442 for(i=0,j=1;i<seqlength;i++)
00443 {
00444 if(seq_array[seq][i]=='-')
00445 {
00446 seq_code[seq][i]=0;
00447 }
00448 else
00449 {
00450 seq_code[seq][i]=j++;
00451 }
00452 }
00453 }
00454
00455
00456
00457 void read_annotation(FILE *fin)
00458 {
00459 int i,j,k,s,e;
00460 char c,line[MAXLINE+1];
00461 char t[MAXLINE+1];
00462 int numargs;
00463 char *args[101];
00464 Boolean g;
00465
00466 nblocks=0;
00467 while(fgets(line,MAXLINE+1,fin)) {
00468 if(keyword(line,"BPOS")) {
00469 nblocks=0;
00470 numargs = getargs(line,args,101);
00471 for(i=1;i<numargs;i++)
00472 {
00473 s=sscanf(args[i],"%d%c%d",&blocks[nblocks].s,&c,&blocks[nblocks].e);
00474 if(s==1) {
00475 blocks[nblocks].e=blocks[nblocks].s;
00476 }
00477 else if(s!=3) {
00478 break;
00479 }
00480 blocks[nblocks].nseqs=refnseqs;
00481 nblocks++;
00482 }
00483 break;
00484 }
00485 }
00486
00487 while(fgets(line,MAXLINE+1,fin)) {
00488 if(keyword(line,"BREPART")) {
00489 numargs = getargs(line,args,101);
00490 for(i=1;i<numargs;i++)
00491 {
00492 blocks[i-1].nseqs = 0;
00493 sscanf(args[i],"%s",t);
00494 blocks[i-1].nseqs = parse_repart(t);
00495 }
00496 }
00497 }
00498
00499 for(i=0;i<refseqlength;i++)
00500 refseq_col[i]=0;
00501
00502 for(i=0;i<nblocks;i++) {
00503 if(blocks[i].nseqs > 0) {
00504 for(j=0, k=0;j<refseqlength;j++)
00505 {
00506 if(isalpha(refseq_array[0][j]))
00507 k++;
00508 if(k==blocks[i].s) {
00509 s=j;
00510 break;
00511 }
00512 }
00513 for(;j<refseqlength;j++)
00514 {
00515 if(isalpha(refseq_array[0][j]))
00516 k++;
00517 if(k==blocks[i].e) {
00518 e=j+1;
00519 break;
00520 }
00521 }
00522 for(j=s;j<=e;j++) {
00523 refseq_col[j]=0;
00524 for(k=0;k<blocks[i].nseqs;k++)
00525 if(isalpha(refseq_array[k][j])) refseq_col[j]++;
00526 }
00527 }
00528 }
00529
00530 }
00531
00532 Boolean keyword(char *line,char *code)
00533 {
00534 int i;
00535 char key[MAXLINE];
00536
00537 for(i=0;!isspace(line[i]) && line[i]!=EOS;i++)
00538 key[i]=line[i];
00539 key[i]=EOS;
00540 return( strcmp(key,code) == 0 );
00541 }
00542
00543 int getargs(char *line,char *args[],int max)
00544 {
00545
00546 char *inptr;
00547 int i;
00548
00549 inptr=line;
00550 for (i=0;i<=max;i++)
00551 {
00552 if ((args[i]=strtok(inptr," \t\n"))==NULL)
00553 break;
00554 inptr=NULL;
00555 }
00556 if (i>max)
00557 {
00558 fprintf(stdout,"Too many args\n");
00559 return(0);
00560 }
00561
00562 return(i);
00563 }
00564
00565 int parse_repart(char *str)
00566 {
00567 int len,s,e,i,ix,tot;
00568 char t[MAXLINE+1];
00569
00570
00571 tot=0;
00572 len=strlen(str);
00573 for(ix=0;ix<len;)
00574 {
00575 for(;ix<len && !isalnum(str[ix]);ix++);
00576 s=ix;
00577 for(;ix<len && str[ix]!='+';ix++);
00578 e=ix;
00579 for(i=s;i<e;i++)
00580 t[i-s]=str[i];
00581 t[i-s]='\0';
00582
00583 if(toupper(t[0])=='H') {
00584 sscanf(&t[1],"%d",&i);
00585 if(i>0 && i<=refnseqs) tot += i;
00586 else fprintf(stderr,"WARNING: bad format in repartition %s\n",str);
00587 }
00588 else {
00589 sscanf(t,"%d",&i);
00590 if(i>0 && i<=refnseqs) tot += i;
00591 else fprintf(stderr,"WARNING: bad format in repartition %s\n",str);
00592 }
00593 }
00594
00595 return tot;
00596 }
00597
00598 Boolean blankline(char *line)
00599 {
00600 int i;
00601
00602 for(i=0;line[i]!='\n' && line[i]!=EOS;i++) {
00603 if( isdigit(line[i]) ||
00604 isspace(line[i]) ||
00605 (line[i] == '*') ||
00606 (line[i] == ':') ||
00607 (line[i] == '.'))
00608 ;
00609 else
00610 return FALSE;
00611 }
00612 return TRUE;
00613 }
00614
00615 void readref(FILE *fin)
00616 {
00617 static char line[MAXLINE+1];
00618 char seq[MAXLEN+1];
00619 int len,seqno,i,j,k;
00620 unsigned char c;
00621
00622 for(seqno=0;seqno<refnseqs;seqno++)
00623 {
00624
00625 fseek(fin,0,0);
00626
00627 len=0;
00628 for(i=0;;i++) {
00629 if(fgets(line,MAXLINE+1,fin)==NULL) return;
00630 if(line[0]=='/' && line[1]=='/') break;
00631
00632 }
00633
00634 while (fgets(line,MAXLINE+1,fin) != NULL) {
00635 if(!blankline(line)) {
00636
00637 for(i=0;i<seqno;i++) fgets(line,MAXLINE+1,fin);
00638 for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
00639 for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break;
00640 strncpy(refnames[seqno],line+j,k-j);
00641 refnames[seqno][k-j]='\0';
00642 refnames[seqno][MAXNAMES]='\0';
00643
00644 for(i=k;i<=MAXLINE;i++) {
00645 c=line[i];
00646 if(c == '.' || c == '~' ) c = '-';
00647 if(c == '*') c = 'X';
00648 if(c == '\n' || c == EOS) break;
00649 if(isalpha(c) || c=='-') seq[len++]=c;
00650 }
00651
00652 for(i=0;;i++) {
00653 if(fgets(line,MAXLINE+1,fin)==NULL) break;
00654 if(blankline(line)) break;
00655 }
00656 }
00657 }
00658 strcpy(refseq_array[seqno],seq);
00659 if(seqno==0) refseqlength=len;
00660 }
00661 }
00662
00663 void readmsf(FILE *fin)
00664 {
00665 static char line[MAXLINE+1];
00666 char seq[MAXLEN+1];
00667 int len,seqno,i,j,k;
00668 unsigned char c;
00669
00670 for(seqno=0;seqno<nseqs;seqno++)
00671 {
00672
00673 fseek(fin,0,0);
00674
00675 len=0;
00676 for(i=0;;i++) {
00677 if(fgets(line,MAXLINE+1,fin)==NULL) return;
00678 if(line[0]=='/' && line[1]=='/') break;
00679
00680 }
00681
00682 while (fgets(line,MAXLINE+1,fin) != NULL) {
00683 if(!blankline(line)) {
00684
00685 for(i=0;i<seqno;i++) fgets(line,MAXLINE+1,fin);
00686 for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
00687 for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break;
00688 strncpy(names[seqno],line+j,k-j);
00689 names[seqno][k-j]='\0';
00690 names[seqno][MAXNAMES]='\0';
00691
00692 for(i=k;i<=MAXLINE;i++) {
00693 c=line[i];
00694 if(c == '.' || c == '~' ) c = '-';
00695 if(c == '*') c = 'X';
00696 if(c == '\n' || c == EOS) break;
00697 if(isalpha(c) || c=='-') seq[len++]=c;
00698 }
00699
00700 for(i=0;;i++) {
00701 if(fgets(line,MAXLINE+1,fin)==NULL) break;
00702 if(blankline(line)) break;
00703 }
00704 }
00705 }
00706 strcpy(seq_array[seqno],seq);
00707 if(seqno==0) seqlength=len;
00708 }
00709 }
00710
00711 int countmsf(FILE *fin)
00712 {
00713
00714
00715 char line[MAXLINE+1];
00716 int lnseqs;
00717
00718 while (fgets(line,MAXLINE+1,fin) != NULL) {
00719 if(line[0]=='/' && line[1]=='/') break;
00720 }
00721
00722 while (fgets(line,MAXLINE+1,fin) != NULL) {
00723 if(!blankline(line)) break;
00724 }
00725 lnseqs = 1;
00726
00727 while (fgets(line,MAXLINE+1,fin) != NULL) {
00728 if(blankline(line)) return lnseqs;
00729 lnseqs++;
00730 }
00731
00732 return 0;
00733 }