Main Page   Class Hierarchy   Compound List   File List   Contact   Download   Symbolic Constraints   Examples  

readseq.c

00001 /* File: readseq.c
00002  * main() program for ureadseq.c, ureadseq.h
00003  *
00004  * Reads and writes nucleic/protein sequence in various
00005  * formats. Data files may have multiple sequences.
00006  *
00007  * Copyright 1990 by d.g.gilbert
00008  * biology dept., indiana university, bloomington, in 47405
00009  * e-mail: gilbertd@bio.indiana.edu
00010  *
00011  * This program may be freely copied and used by anyone.
00012  * Developers are encourged to incorporate parts in their
00013  * programs, rather than devise their own private sequence
00014  * format.
00015  *
00016  * This should compile and run with any ANSI C compiler.
00017  * Please advise me of any bugs, additions or corrections.
00018  *
00019  */
00020 
00021 const char *title
00022     = "readSeq (1Feb93), multi-format molbio sequence reader.\n";
00023 
00024  /*  History
00025   27 Feb 90.  1st release to public.
00026    4 Mar 90.  + Gary Olsen format
00027               + case change
00028               * minor corrections to NBRF,EMBL,others
00029               * output 1 file per sequence for gcg, unknown
00030               * define -DNOSTR for c-libraries w/o strstr
00031               - readseq.p, pascal version, becomes out-of-date
00032   24 May 90.  + Phylip 3.2 output format (no input)
00033   20 Jul 90.  + Phylip 3.3 output (no input yet)
00034               + interactive output re-direction
00035               + verbose progress info
00036               * interactive help output
00037               * dropped line no.s on NBRF output
00038               * patched in HyperGCG XCMD corrections,
00039                 - except for seq. documentation handling
00040               * dropped the IG special nuc codes, as IG has
00041                 adopted the standard IUB codes (now if only
00042                 everyone would adopt a standard format !)
00043   11 Oct 90.  * corrected bug in reading/writing of EMBL format
00044 
00045   17 Oct 91.  * corrected bug in reading Olsen format
00046                 (serious-deletion)
00047   10 Nov 91.  * corrected bug in reading some GCG format files
00048                 (serious-last line duplicated)
00049               + add format name parsing (-fgb, -ffasta, ...)
00050               + Phylip v3.4 output format (== v3.2, sequential)
00051               + add checksum output to all forms that have document
00052               + skip mail headers in seq file
00053               + add pipe for standard input == seq file (with -p)
00054               * fold in parts of MacApp Seq object
00055               * strengthen format detection
00056               * clarify program structure
00057               * remove fixed sequence size limit (now dynamic, sizeof memory)
00058               * check and fold in accumulated bug reports:
00059               *   Now ANSI-C fopen(..,"w") & check open failure
00060               *   Define -DFIXTOUPPER for nonANSI C libraries that mess
00061                   up toupper/tolower
00062               = No command-line changes; callers of readseq main() should be okay
00063               - ureadseq.h functions have changed; client programs need to note.
00064               + added Unix and VMS Make scripts, including validation tests
00065 
00066    4 May 92.  + added 32 bit CRC checksum as alternative to GCG 6.5bit checksum
00067                 (-DBIGCHECKSUM)
00068     Aug 92    = fixed Olsen format input to handle files w/ more sequences,
00069                 not to mess up when more than one seq has same identifier,
00070                 and to convert number masks to symbols.
00071               = IG format fix to understand ^L
00072 
00073   25-30 Dec 92
00074               * revised command-line & interactive interface.  Suggested form is now
00075                   readseq infile -format=genbank -output=outfile -item=1,3,4 ...
00076                 but remains compatible with prior commandlines:
00077                   readseq infile -f2 -ooutfile -i3 ...
00078               + added GCG MSF multi sequence file format
00079               + added PIR/CODATA format
00080               + added NCBI ASN.1 sequence file format
00081               + added Pretty, multi sequence pretty output (only)
00082               + added PAUP multi seq format
00083               + added degap option
00084               + added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option.
00085               + added support for reading Phylip formats (interleave & sequential)
00086               * string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP
00087               * changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version
00088 
00089    1Feb93
00090               = revert GenBank output to a fixed left number width which 
00091                other software depends on.
00092               = fix for MSF input to handle symbols in names
00093               = fix bug for possible memory overrun when truncating seqs for
00094                 Phylip or Paup formats (thanks Anthony Persechini)
00095 
00096  */
00097 
00098 
00099 
00100 /*
00101    Readseq has been tested with:
00102       Macintosh MPW C
00103       GNU gcc
00104       SGI cc
00105       VAX-VMS cc
00106    Any ANSI C compiler should be able to handle this.
00107    Old-style C compilers barf all over the source.
00108 
00109 
00110 How do I build the readseq program if I have an Ansi C compiler?
00111 #--------------------
00112 # Unix ANSI C
00113 # Use the supplied Makefile this way:
00114 %  make CC=name-of-c-compiler
00115 # OR do this...
00116 % gcc readseq.c ureadseq.c -o readseq
00117 
00118 #--------------------
00119 $!VAX-VMS cc
00120 $! Use the supplied Make.Com this way:
00121 $  @make
00122 $! OR, do this:
00123 $ cc readseq, ureadseq
00124 $ link readseq, ureadseq, sys$library:vaxcrtl/lib
00125 $ readseq :== $ MyDisk:[myacct]readseq
00126 
00127 #--------------------
00128 # Macintosh Simple Input/Output Window application
00129 # requires MPW-C and SIOW library (from APDA)
00130 # also uses files macinit.c, macinit.r, readseqSIOW.make
00131 #
00132 Buildprogram readseqSIOW
00133 
00134 #--------------------
00135 #MPW-C v3 tool
00136 C  ureadseq.c
00137 C  readseq.c
00138 link -w -o readseq -t MPST -c 'MPS ' ¶
00139    readseq.c.o Ureadseq.c.o ¶
00140     "{Libraries}"Interface.o ¶
00141     "{Libraries}"ToolLibs.o ¶
00142     "{Libraries}"Runtime.o ¶
00143     "{CLibraries}"StdClib.o
00144 readseq -i1 ig.seq
00145 
00146 # MPW-C with NCBI tools
00147 
00148 set NCBI "{Boot}@molbio:ncbi:"; EXPORT NCBI
00149 set NCBILIB1  "{NCBI}"lib:libncbi.o; export NCBILIB1
00150 set NCBILIB2  "{NCBI}"lib:libncbiobj.o; export NCBILIB2
00151 set NCBILIB3  "{NCBI}"lib:libncbicdr.o; export NCBILIB3
00152 set NCBILIB4  "{NCBI}"lib:libvibrant.o; export NCBILIB4
00153 
00154 C  ureadseq.c
00155 C  -d NCBI -i "{NCBI}"include: ureadasn.c
00156 C  -d NCBI -i "{NCBI}"include: readseq.c
00157 link -w -o readseq -t MPST -c 'MPS ' ¶
00158    ureadseq.c.o ureadasn.c.o readseq.c.o  ¶
00159     {NCBILIB4} {NCBILIB2} {NCBILIB1} ¶
00160     "{Libraries}"Interface.o ¶
00161     "{Libraries}"ToolLibs.o ¶
00162     "{Libraries}"Runtime.o ¶
00163     "{CLibraries}"CSANELib.o ¶
00164     "{CLibraries}"Math.o ¶
00165     "{CLibraries}"StdClib.o
00166 
00167 ===========================================================*/
00168 
00169 
00170 
00171 #include <stdio.h>
00172 #include <string.h>
00173 #include <ctype.h>
00174 
00175 #include "ureadseq.h"
00176 
00177 #pragma segment readseq
00178 
00179 
00180 
00181 static char inputfilestore[256], *inputfile = inputfilestore;
00182 
00183 const char *formats[kMaxFormat+1] = {
00184     " 1. IG/Stanford",
00185     " 2. GenBank/GB",
00186     " 3. NBRF",
00187     " 4. EMBL",
00188     " 5. GCG",
00189     " 6. DNAStrider",
00190     " 7. Fitch",
00191     " 8. Pearson/Fasta",
00192     " 9. Zuker (in-only)",
00193     "10. Olsen (in-only)",
00194     "11. Phylip3.2",
00195     "12. Phylip",
00196     "13. Plain/Raw",
00197     "14. PIR/CODATA",
00198     "15. MSF",
00199     "16. ASN.1",
00200     "17. PAUP/NEXUS",
00201     "18. Pretty (out-only)",
00202     "" };
00203 
00204 #define kFormCount  30
00205 #define kMaxFormName 15
00206 
00207 const  struct formatTable {
00208   char  *name;
00209   short num;
00210   } formname[] = {
00211     {"ig",  kIG},
00212     {"stanford", kIG},
00213     {"genbank", kGenBank},
00214     {"gb", kGenBank},
00215     {"nbrf", kNBRF},
00216     {"embl", kEMBL},
00217     {"gcg", kGCG},
00218     {"uwgcg", kGCG},
00219     {"dnastrider", kStrider},
00220     {"strider", kStrider},
00221     {"fitch", kFitch},
00222     {"pearson", kPearson},
00223     {"fasta", kPearson},
00224     {"zuker", kZuker},
00225     {"olsen", kOlsen},
00226     {"phylip", kPhylip},
00227     {"phylip3.2", kPhylip2},
00228     {"phylip3.3", kPhylip3},
00229     {"phylip3.4", kPhylip4},
00230     {"phylip-interleaved", kPhylip4},
00231     {"phylip-sequential", kPhylip2},
00232     {"plain", kPlain},
00233     {"raw", kPlain},
00234     {"pir", kPIR},
00235     {"codata", kPIR},
00236     {"asn.1", kASN1},
00237     {"msf", kMSF},
00238     {"paup", kPAUP},
00239     {"nexus", kPAUP},
00240     {"pretty", kPretty},
00241   };
00242 
00243 const char *kASN1headline = "Bioseq-set ::= {\nseq-set {\n";
00244 
00245 /* GWW table for getting the complement of a nucleotide (IUB codes) */
00246 /*                     ! "#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[ \]^_`abcdefghijklmnopqrstuvwxyz{|}~ */
00247 const char compl[] = " !\"#$%&'()*+,-./0123456789:;<=>?@TVGHNNCDNNMNKNNYRYSAABWNRN[\\]^_`tvghnncdnnmnknnyrysaabwnrn{|}~";
00248 
00249 
00250 
00251 char *formatstr( short format)
00252 {
00253   if (format < 1 || format > kMaxFormat) {
00254     switch (format) {
00255       case kASNseqentry :
00256       case kASNseqset   : return formats[kASN1-1];
00257       case kPhylipInterleave:
00258       case kPhylipSequential: return formats[kPhylip-1];
00259       default: return "(unknown)";
00260       }
00261     }
00262   else return formats[format-1];
00263 }
00264 
00265 int parseformat( char *name)
00266 {
00267 #define kDupmatch  -2
00268   int   namelen, maxlen, i, match, matchat;
00269   char  lname[kMaxFormName+1];
00270 
00271   skipwhitespace(name);
00272   namelen = strlen(name);
00273   if (namelen == 0)
00274     return kNoformat;
00275   else if (isdigit(*name)) {
00276     i = atol( name);
00277     if (i < kMinFormat | i > kMaxFormat) return kNoformat;
00278     else return i;
00279     }
00280 
00281   /* else match character name */
00282   maxlen = min( kMaxFormName, namelen);
00283   for (i=0; i<maxlen; i++) lname[i] = to_lower(name[i]);
00284   lname[maxlen]=0;
00285   matchat = kNoformat;
00286 
00287   for (i=0; i<kFormCount; i++) {
00288     match = strncmp( lname, formname[i].name, maxlen);
00289     if (match == 0) {
00290       if (strlen(formname[i].name) == namelen) return (formname[i].num);
00291       else if (matchat == kNoformat) matchat = i;
00292       else matchat = kDupmatch; /* 2 or more partial matches */
00293       }
00294     }
00295   if (matchat == kNoformat || matchat == kDupmatch)
00296     return kNoformat;
00297   else
00298     return formname[matchat].num;
00299 }
00300 
00301 
00302 
00303 static void dumpSeqList(char *list, short format)
00304 {
00305   long i, l, listlen;
00306   char s[256];
00307 
00308   listlen = strlen(list);
00309   printf("Sequences in %s  (format is %s)\n", inputfile, formatstr(format));
00310   for (i=0, l=0; i < listlen; i++) {
00311     if (list[i] == (char)NEWLINE) {
00312       s[l] = '\0'; l = 0;
00313       puts(s);
00314       }
00315     else if (l < 255)
00316       s[l++] = list[i];
00317     }
00318   putchar('\n');
00319 }
00320 
00321 
00322 
00323 void usage()
00324 {
00325   short   i, midi;
00326 
00327   fprintf(stderr,title);
00328   fprintf(stderr,
00329   "usage: readseq [-options] in.seq > out.seq\n");
00330   fprintf(stderr," options\n");
00331 /* ? add -d[igits] to allow digits in sequence data, &/or option to specify seq charset !? */
00332   fprintf(stderr, "    -a[ll]         select All sequences\n");
00333   fprintf(stderr, "    -c[aselower]   change to lower case\n");
00334   fprintf(stderr, "    -C[ASEUPPER]   change to UPPER CASE\n");
00335   fprintf(stderr, "    -degap[=-]     remove gap symbols\n");
00336   fprintf(stderr, "    -i[tem=2,3,4]  select Item number(s) from several\n");
00337   fprintf(stderr, "    -l[ist]        List sequences only\n");
00338   fprintf(stderr, "    -o[utput=]out.seq  redirect Output\n");
00339   fprintf(stderr, "    -p[ipe]        Pipe (command line, <stdin, >stdout)\n");
00340   fprintf(stderr, "    -r[everse]     change to Reverse-complement\n");
00341   fprintf(stderr, "    -v[erbose]     Verbose progress\n");
00342   fprintf(stderr, "    -f[ormat=]#    Format number for output,  or\n");
00343   fprintf(stderr, "    -f[ormat=]Name Format name for output:\n");
00344   midi = (kMaxFormat+1) / 2;
00345   for (i = kMinFormat-1; i < midi; i++)
00346    fprintf( stderr, "        %-20s      %-20s\n",
00347     formats[i], formats[midi+i]);
00348 
00349   /* new output format options, esp. for pretty format: */
00350   fprintf(stderr, "     \n");
00351   fprintf(stderr, "   Pretty format options: \n");
00352   fprintf(stderr, "    -wid[th]=#            sequence line width\n");
00353   fprintf(stderr, "    -tab=#                left indent\n");
00354   fprintf(stderr, "    -col[space]=#         column space within sequence line on output\n");
00355   fprintf(stderr, "    -gap[count]           count gap chars in sequence numbers\n");
00356   fprintf(stderr, "    -nameleft, -nameright[=#]   name on left/right side [=max width]\n");
00357   fprintf(stderr, "    -nametop              name at top/bottom\n");
00358   fprintf(stderr, "    -numleft, -numright   seq index on left/right side\n");
00359   fprintf(stderr, "    -numtop, -numbot      index on top/bottom\n");
00360   fprintf(stderr, "    -match[=.]            use match base for 2..n species\n");
00361   fprintf(stderr, "    -inter[line=#]        blank line(s) between sequence blocks\n");
00362 
00363   /******  not ready yet
00364   fprintf(stderr, "    -code=none,rtf,postscript,ps   code syntax\n");
00365   fprintf(stderr, "    -namefont=, -numfont=, -seqfont=font   font choice\n");
00366   fprintf(stderr, "       font suggestions include times,courier,helvetica\n");
00367   fprintf(stderr, "    -namefontsize=, -numfontsize=, -seqfontsize=#\n");
00368   fprintf(stderr, "       fontsize suggestions include 9,10,12,14\n");
00369   fprintf(stderr, "    -namefontstyle=, -numfontstyle=, -seqfontstyle= style  fontstyle for names\n");
00370   fprintf(stderr, "       fontstyle options are plain,italic,bold,bold-italic\n");
00371   ******/
00372 }
00373 
00374 void erralert(short err)
00375 {
00376   switch (err) {
00377     case 0  :
00378       break;
00379     case eFileNotFound: fprintf(stderr, "File not found: %s\n", inputfile);
00380       break;
00381     case eFileCreate: fprintf(stderr, "Can't open output file.\n");
00382       break;
00383     case eASNerr: fprintf(stderr, "Error in ASN.1 sequence routines.\n");
00384       break;
00385     case eNoData: fprintf(stderr, "No data in file.\n");
00386       break;
00387     case eItemNotFound: fprintf(stderr, "Specified item not in file.\n");
00388       break;
00389     case eUnequalSize:  fprintf(stderr,
00390       "This format requires equal length sequences.\nSequence truncated or padded to fit.\n");
00391       break;
00392     case eUnknownFormat: fprintf(stderr, "Error: this format is unknown to me.\n");
00393       break;
00394     case eOneFormat: fprintf(stderr,
00395       "Warning: This format permits only 1 sequence per file.\n");
00396       break;
00397     case eMemFull: fprintf(stderr, "Out of storage memory. Sequence truncated.\n");
00398       break;
00399     default: fprintf(stderr, "readSeq error = %d\n", err);
00400       break;
00401     }
00402 } /* erralert */
00403 
00404 
00405 int chooseFormat( boolean quietly)
00406 {
00407   char  sform[128];
00408   int   midi, i, outform;
00409 
00410     if (quietly)
00411       return kPearson;  /* default */
00412     else {
00413       midi = (kMaxFormat+1) / 2;
00414       for (i = kMinFormat-1; i < midi; i++)
00415         fprintf( stderr, "        %-20s      %-20s\n",
00416                         formats[i], formats[midi+i]);
00417       fprintf(stderr,"\nChoose an output format (name or #): \n");
00418       gets(sform);
00419       outform = parseformat(sform);
00420       if (outform == kNoformat) outform = kPearson;
00421       return outform;
00422       }
00423 }
00424 
00425 
00426 
00427 /* read paramater(s) */
00428 
00429 boolean checkopt( boolean casesense, char *sopt, const char *smatch, short minword)
00430 {
00431   long  lenopt, lenmatch;
00432   boolean result;
00433   short minmaxw;
00434 
00435   lenopt = strlen(sopt);
00436   lenmatch= strlen(smatch);
00437   minmaxw= max(minword, min(lenopt, lenmatch));
00438 
00439   if (casesense)
00440     result= (!strncmp( sopt, smatch, minmaxw));
00441   else
00442     result= (!Strncasecmp( sopt, smatch, minmaxw ));
00443   /* if (result) { */
00444     /* fprintf(stderr,"true checkopt(opt=%s,match=%s,param=%s)\n", sopt, smatch, *sparam); */
00445   /*  } */
00446   return result;
00447 }
00448 
00449 
00450 #define   kMaxwhichlist  50
00451 
00452 /* global for readopt(), main() */
00453 boolean   chooseall = false, quietly = false, gotinputfile = false,
00454           listonly = false, closeout = false, verbose = false,
00455           manyout = false, dolower = false, doupper = false, doreverse= false,
00456           askout  = true, dopipe= false, interleaved = false;
00457 short     nfile = 0, iwhichlist=0, nwhichlist = 0;
00458 short     whichlist[kMaxwhichlist+1];
00459 long      whichSeq = 0, outform = kNoformat;
00460 char      onamestore[128], *oname = onamestore;
00461 FILE      *foo = NULL;
00462 
00463 void resetGlobals()
00464 /* need this when used from SIOW, as these globals are not reinited automatically
00465 between calls to local main() */
00466 {
00467   chooseall = false; quietly = false; gotinputfile = false;
00468   listonly = false; closeout = false; verbose = false;
00469   manyout = false; dolower = false; doupper = false; doreverse= false;
00470   askout  = true; dopipe= false; interleaved = false;
00471   nfile = 0; iwhichlist=0; nwhichlist = 0;
00472   whichSeq = 0; outform = kNoformat;
00473   oname = onamestore;
00474   foo = NULL;
00475 
00476   gPrettyInit(gPretty);
00477 }
00478 
00479 
00480 #define kOptOkay  1
00481 #define kOptNone  0
00482 
00483 int readopt( char *sopt)
00484 {
00485   char    sparamstore[256], *sparam= sparamstore;
00486   short   n, slen= strlen(sopt);
00487 
00488   /* fprintf(stderr,"readopt( %s) == ", sopt); */
00489 
00490   if (*sopt == '?') {
00491     usage();
00492     return kOptNone;   /*? eOptionBad or kOptNone */
00493     }
00494 
00495   else if (*sopt == '-') {
00496 
00497     char *cp= strchr(sopt,'=');
00498     *sparam= '\0';
00499     if (cp) {
00500       strcpy(sparam, cp+1);
00501       *cp= 0;
00502       }
00503 
00504     if (checkopt( false, sopt, "-help", 2)) {
00505       usage();
00506       return kOptNone;
00507       }
00508 
00509     if (checkopt( false, sopt, "-all", 2)) {
00510       whichSeq= 1; chooseall= true;
00511       return kOptOkay;
00512       }
00513 
00514     if (checkopt( false, sopt, "-colspace", 4)) { /* test before -c[ase] */
00515       n= atoi( sparam);
00516       gPretty.spacer = n;
00517       return kOptOkay;
00518       }
00519 
00520     if (checkopt( true, sopt, "-caselower", 2)) {
00521       dolower= true;
00522       return kOptOkay;
00523       }
00524     if (checkopt( true, sopt, "-CASEUPPER", 2)) {
00525       doupper= true;
00526       return kOptOkay;
00527       }
00528 
00529     if (checkopt( false, sopt, "-pipe", 2)) {
00530       dopipe= true; askout= false;
00531       return kOptOkay;
00532       }
00533 
00534     if (checkopt( false, sopt, "-list", 2)) {
00535       listonly = true; askout = false;
00536       return kOptOkay;
00537       }
00538 
00539     if (checkopt( false, sopt, "-reverse", 2)) {
00540       doreverse = true;
00541       return kOptOkay;
00542       }
00543 
00544     if (checkopt( false, sopt, "-verbose", 2)) {
00545       verbose = true;
00546       return kOptOkay;
00547       }
00548 
00549     if (checkopt( false, sopt, "-match", 5)) {
00550       gPretty.domatch= true;
00551       if (*sparam >= ' ') gPretty.matchchar= *sparam;
00552       return kOptOkay;
00553       }
00554     if (checkopt( false, sopt, "-degap", 4)) {
00555       gPretty.degap= true;
00556       if (*sparam >= ' ') gPretty.gapchar= *sparam;
00557       return kOptOkay;
00558       }
00559 
00560     if (checkopt( false, sopt, "-interline", 4)) {
00561       gPretty.interline= atoi( sparam);
00562       return kOptOkay;
00563       }
00564 
00565     if (checkopt( false, sopt, "-item", 2)) {
00566       char  *cp = sparam;
00567       nwhichlist= 0;
00568       whichlist[0]= 0;
00569       if (*cp == 0) cp= sopt+2; /* compatible w/ old way */
00570       do {
00571         while (*cp!=0 && !isdigit(*cp)) cp++;
00572         if (*cp!=0) {
00573           n = atoi( cp);
00574           whichlist[nwhichlist++]= n;
00575           while (*cp!=0 && isdigit(*cp)) cp++;
00576           }
00577       } while (*cp!=0 && n>0 && nwhichlist<kMaxwhichlist);
00578       whichlist[nwhichlist++]= 0; /* 0 == stopsign for loop */
00579       whichSeq= max(1,whichlist[0]); iwhichlist= 1;
00580       return kOptOkay;
00581       }
00582 
00583     if (checkopt( false, sopt, "-format", 5)) {/* -format=phylip, -f2, -form=phylip */
00584       if (*sparam==0) { for (sparam= sopt+2; isalpha(*sparam); sparam++) ; }
00585       outform = parseformat( sparam);
00586       return kOptOkay;
00587       }
00588     if (checkopt( false, sopt, "-f", 2)) { /* compatible w/ -fphylip prior version */
00589       if (*sparam==0) sparam= sopt+2;
00590       outform = parseformat( sparam);
00591       return kOptOkay;
00592       }
00593 
00594     if (checkopt( false, sopt, "-output", 3)) {/* -output=myseq */
00595       if (*sparam==0) { for (sparam= sopt+3; isalpha(*sparam); sparam++) ; }
00596       strcpy( oname, sparam);
00597       foo = fopen( oname, "w");
00598       if (!foo) { erralert(eFileCreate); return eFileCreate; }
00599       closeout = true;
00600       askout = false;
00601       return kOptOkay;
00602       }
00603     if (checkopt( false, sopt, "-o", 2)) {  /* compatible w/ -omyseq prior version */
00604       if (*sparam==0) sparam= sopt+2;
00605       strcpy( oname, sparam);
00606       foo = fopen( oname, "w");
00607       if (!foo) { erralert(eFileCreate); return eFileCreate; }
00608       closeout = true;
00609       askout = false;
00610       return kOptOkay;
00611       }
00612 
00613     if (checkopt( false, sopt, "-width", 2)) {
00614       if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
00615       n= atoi( sparam);
00616       if (n>0) gPretty.seqwidth = n;
00617       return kOptOkay;
00618       }
00619 
00620     if (checkopt( false, sopt, "-tab", 4)) {
00621       if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
00622       n= atoi( sparam);
00623       gPretty.tab = n;
00624       return kOptOkay;
00625       }
00626 
00627     if (checkopt( false, sopt, "-gapcount", 4)) {
00628       gPretty.baseonlynum = false;
00629       /* if (*sparam >= ' ') gPretty.gapchar= *sparam; */
00630       return kOptOkay;
00631       }
00632     if (checkopt( false, sopt, "-nointerleave", 8)) {
00633       gPretty.noleaves = true;
00634       return kOptOkay;
00635       }
00636 
00637     if (checkopt( false, sopt, "-nameleft", 7)) {
00638       if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
00639       n= atoi( sparam);
00640       if (n>0 && n<50) gPretty.namewidth =  n;
00641       gPretty.nameleft= true;
00642       return kOptOkay;
00643       }
00644     if (checkopt( false, sopt, "-nameright", 7)) {
00645       if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
00646       n= atoi( sparam);
00647       if (n>0 && n<50) gPretty.namewidth =  n;
00648       gPretty.nameright= true;
00649       return kOptOkay;
00650       }
00651     if (checkopt( false, sopt, "-nametop", 6)) {
00652       gPretty.nametop= true;
00653       return kOptOkay;
00654       }
00655 
00656     if (checkopt( false, sopt, "-numleft", 6)) {
00657       if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
00658       n= atoi( sparam);
00659       if (n>0 && n<50) gPretty.numwidth =  n;
00660       gPretty.numleft= true;
00661       return kOptOkay;
00662       }
00663     if (checkopt( false, sopt, "-numright", 6)) {
00664       if (*sparam==0) { for (sparam= sopt+2; !isdigit(*sparam) && *sparam!=0; sparam++) ; }
00665       n= atoi( sparam);
00666       if (n>0 && n<50) gPretty.numwidth =  n;
00667       gPretty.numright= true;
00668       return kOptOkay;
00669       }
00670 
00671     if (checkopt( false, sopt, "-numtop", 6)) {
00672       gPretty.numtop= true;
00673       return kOptOkay;
00674       }
00675     if (checkopt( false, sopt, "-numbottom", 6)) {
00676       gPretty.numbot= true;
00677       return kOptOkay;
00678       }
00679 
00680     else {
00681       usage();
00682       return eOptionBad;
00683       }
00684     }
00685 
00686   else {
00687     strcpy( inputfile, sopt);
00688     gotinputfile = (*inputfile != 0);
00689     nfile++;
00690     return kOptOkay;
00691     }
00692 
00693  /* return kOptNone; -- never here */
00694 }
00695 
00696 
00697 
00698 
00699 /* this program suffers some as it tries to be a quiet translator pipe
00700    _and_ a noisy user interactor
00701 */
00702 
00703 /* return is best for SIOW, okay for others */
00704 #ifdef SIOW
00705 #define Exit(a)   return(a)
00706 siow_main( int argc, char *argv[])
00707 
00708 #else
00709 #define Exit(a)   exit(a)
00710 
00711 main( int argc, char *argv[])
00712 #endif
00713 {
00714 boolean   closein = false;
00715 short     ifile, nseq, atseq, format, err = 0, seqtype = kDNA,
00716           nlines, seqout = 0, phylvers = 2;
00717 long      i, skiplines, seqlen, seqlen0;
00718 unsigned long  checksum= 0, checkall= 0;
00719 char      *seq, *cp, *firstseq = NULL, *seqlist, *progname, tempname[256];
00720 char      seqid[256], *seqidptr = seqid;
00721 char      stempstore[256], *stemp = stempstore;
00722 FILE      *ftmp, *fin, *fout;
00723 long      outindexmax= 0, noutindex= 0, *outindex = NULL;
00724 
00725 #define exit_main(err) {        \
00726   if (closeout) fclose(fout);   \
00727   if (closein) fclose(fin);   \
00728   if (*tempname!=0) remove(tempname);\
00729   Exit(err); }
00730 
00731 #define indexout()  if (interleaved) {\
00732   if (noutindex>=outindexmax) {\
00733     outindexmax= noutindex + 20;\
00734     outindex= (long*) realloc(outindex, sizeof(long)*outindexmax);\
00735     if (outindex==NULL) { err= eMemFull; erralert(err); exit_main(err); }\
00736     }\
00737   outindex[noutindex++]= ftell(fout);\
00738   }
00739 
00740 
00741   resetGlobals();
00742   foo = stdout;
00743   progname = argv[0];
00744   *oname = 0;
00745   *tempname = 0;
00746   /* initialize gPretty ?? -- done in header */
00747 
00748   for (i=1; i < argc; i++) {
00749     err= readopt( argv[i]);
00750     if (err <= 0) exit_main(err);
00751     }
00752 
00753                             /* pipe input from stdin !? */
00754   if (dopipe && !gotinputfile) {
00755     int c;
00756     tmpnam(tempname);
00757     inputfile = tempname;
00758     ftmp = fopen( inputfile, "w");
00759     if (!ftmp) { erralert(eFileCreate); exit_main(eFileCreate); }
00760     while ((c = getc(stdin)) != EOF) fputc(c, ftmp);
00761     fclose(ftmp);
00762     gotinputfile= true;
00763     }
00764 
00765   quietly = (dopipe || (gotinputfile && (listonly || whichSeq != 0)));
00766 
00767   if (verbose || (!quietly && !gotinputfile)) fprintf( stderr, title);
00768   ifile = 1;
00769 
00770                             /* UI: Choose output */
00771   if (askout && !closeout && !quietly) {
00772     askout = false;
00773     fprintf(stderr,"\nName of output file (?=help, defaults to display): \n");
00774     gets(oname= onamestore);
00775     skipwhitespace(oname);
00776     if (*oname == '?') { usage(); exit_main(0); }
00777     else if (*oname != 0) {
00778       closeout = true;
00779       foo = fopen( oname, "w");
00780       if (!foo) { erralert(eFileCreate); exit_main(eFileCreate); }
00781       }
00782     }
00783 
00784   fout = foo;
00785   if (outform == kNoformat) outform = chooseFormat(quietly);
00786 
00787                           /* set up formats ... */
00788   switch (outform) {
00789     case kPhylip2:
00790       interleaved= false;
00791       phylvers = 2;
00792       outform = kPhylip;
00793       break;
00794 
00795     case kPhylip4:
00796       interleaved= true;
00797       phylvers = 4;
00798       outform = kPhylip;
00799       break;
00800 
00801     case kMSF:
00802     case kPAUP:
00803       interleaved= true;
00804       break;
00805 
00806     case kPretty:
00807       gPretty.isactive= true;
00808       interleaved= true;
00809       break;
00810 
00811     }
00812 
00813   if (gPretty.isactive && gPretty.noleaves) interleaved= false;
00814   if (interleaved) {
00815     fout = ftmp = tmpfile();
00816     outindexmax= 30; noutindex= 0;
00817     outindex = (long*) malloc(outindexmax*sizeof(long));
00818     if (outindex==NULL) { err= eMemFull; erralert(err); exit_main(err); }
00819     }
00820 
00821                         /* big loop over all input files */
00822   do {
00823                         /* select next input file */
00824     gotinputfile = (*tempname != 0);
00825     while ((ifile < argc) && (!gotinputfile)) {
00826       if (*argv[ifile] != '-') {
00827         strcpy( inputfile, argv[ifile]);
00828         gotinputfile = (*inputfile != 0);
00829         --nfile;
00830         }
00831       ifile++;
00832       }
00833 
00834     while (!gotinputfile) {
00835       fprintf(stderr,"\nName an input sequence or -option: \n");
00836       inputfile= inputfilestore;
00837 
00838       gets(stemp= stempstore);
00839       if (*stemp==0) goto fini;  /* !! need this to finish work during interactive use */
00840       stemp= strtok(stempstore, " \n\r\t");
00841       while (stemp) {
00842         err= readopt( stemp); /* will read inputfile if it exists */
00843         if (err<0) exit_main(err);
00844         stemp= strtok( NULL, " \n\r\t");
00845         }
00846       }
00847               /* thanks to AJB@UK.AC.DARESBURY.DLVH for this PHYLIP3 fix: */
00848               /* head for end (interleave if needed) */
00849     if (*inputfile == 0) break;
00850 
00851     format = seqFileFormat( inputfile, &skiplines, &err);
00852 
00853     if (err == 0)  {
00854 #ifdef NCBI
00855       if (format == kASNseqentry || format == kASNseqset)
00856         seqlist = listASNSeqs( inputfile, skiplines, format, &nseq, &err);
00857       else
00858 #endif
00859         seqlist = listSeqs( inputfile, skiplines, format, &nseq, &err);
00860       }
00861 
00862     if (err != 0)
00863       erralert(err);
00864 
00865     else if (listonly) {
00866       dumpSeqList(seqlist,format);
00867       free( seqlist);
00868       }
00869 
00870     else {
00871                                 /* choose whichSeq if needed */
00872       if (nseq == 1 || chooseall || (quietly && whichSeq == 0)) {
00873         chooseall= true;
00874         whichSeq = 1;
00875         quietly = true; /* no loop */
00876         }
00877       else if (whichSeq > nseq && quietly) {
00878         erralert(eItemNotFound);
00879         err= eItemNotFound;
00880         }
00881       else if (whichSeq > nseq || !quietly) {
00882         dumpSeqList(seqlist, format);
00883         fprintf(stderr,"\nChoose a sequence (# or All): \n");
00884         gets(stemp= stempstore);
00885         skipwhitespace(stemp);
00886         if (to_lower(*stemp) == 'a') {
00887           chooseall= true;
00888           whichSeq = 1;
00889           quietly = true; /* !? this means we don't ask for another file 
00890                             as well as no more whichSeqs... */
00891           }
00892         else if (isdigit(*stemp)) whichSeq= atol(stemp);
00893         else whichSeq= 1; /* default */
00894         }
00895       free( seqlist);
00896 
00897       if (false /*chooseall*/) {  /* this isn't debugged yet...*/
00898         fin = fopen(inputfile, "r");
00899         closein= true;
00900         }
00901 
00902       while (whichSeq > 0 && whichSeq <= nseq) {
00903                                 /* need to open multiple output files ? */
00904         manyout = ((chooseall || nwhichlist>1) && nseq > 1
00905                   && (outform == kPlain || outform == kGCG));
00906         if (manyout) {
00907           if ( whichSeq == 1 ) erralert(eOneFormat);
00908           else if (closeout) {
00909             sprintf( stemp,"%s_%d", oname, whichSeq);
00910             freopen( stemp, "w", fout);
00911             fprintf( stderr,"Writing sequence %d to file %s\n", whichSeq, stemp);
00912             }
00913           }
00914 
00915         if (closein) {
00916           /* !! this fails... skips most seqs... */
00917           /* !! in sequential read, must count seqs already read from whichSeq ... */
00918           /* need major revision of ureadseq before we can do this */
00919           atseq= whichSeq-1;
00920           seqidptr= seqid;
00921           seq = readSeqFp( whichSeq, fin, skiplines, format,
00922                           &seqlen, &atseq, &err, seqidptr);
00923           skiplines= 0;
00924           }
00925         else {
00926           atseq= 0;
00927           seqidptr= seqid;
00928 #ifdef NCBI
00929           if (format == kASNseqentry || format == kASNseqset) {
00930             seqidptr= NULL;
00931             seq = readASNSeq( whichSeq, inputfile, skiplines, format,
00932                      &seqlen, &atseq, &err, &seqidptr);
00933             }
00934           else
00935 #endif
00936           seq = readSeq( whichSeq, inputfile, skiplines, format,
00937                           &seqlen, &atseq, &err, seqidptr);
00938           }
00939 
00940 
00941         if (gPretty.degap) {
00942           char *newseq;
00943           long newlen;
00944           newseq= compressSeq( gPretty.gapchar, seq, seqlen, &newlen);
00945           if (newseq) {
00946             free(seq); seq= newseq; seqlen= newlen;
00947             }
00948           }
00949 
00950         if (outform == kMSF) checksum= GCGchecksum(seq, seqlen, &checkall);
00951         else if (verbose) checksum= seqchecksum(seq, seqlen, &checkall);
00952         if (verbose)
00953           fprintf( stderr, "Sequence %d, length= %d, checksum= %X, format= %s, id= %s\n",
00954                 whichSeq, seqlen, checksum, formatstr(format), seqidptr);
00955 
00956         if (err != 0) erralert(err);
00957         else {
00958                                   /* format fixes that writeseq doesn't do */
00959           switch (outform) {
00960             case kPIR:
00961               if (seqout == 0) fprintf( foo,"\\\\\\\n");
00962               break;
00963             case kASN1:
00964               if (seqout == 0) fprintf( foo, kASN1headline);
00965               break;
00966 
00967             case kPhylip:
00968               if (seqout == 0) {
00969                 if (!interleaved) {  /*  bug, nseq is for 1st infile only */
00970                   if (chooseall) i= nseq; else i=1;
00971                   if (phylvers >= 4) fprintf(foo," %d %d\n", i, seqlen);
00972                   else fprintf(foo," %d %d YF\n", i, seqlen);
00973                   }
00974                 seqlen0 = seqlen;
00975                 }
00976               else if (seqlen != seqlen0) {
00977                 erralert(eUnequalSize);
00978                 if (seqlen < seqlen0) seq = (char *)realloc(seq, seqlen0);
00979                 for (i=seqlen; i<seqlen0; i++) seq[i]= gPretty.gapchar;
00980                 seqlen = seqlen0;
00981                 seq[seqlen] = 0; 
00982                 }
00983               break;
00984 
00985             case kPAUP:
00986               if (seqout == 0) {
00987                 seqtype= getseqtype(seq, seqlen);
00988                 seqlen0 = seqlen;
00989                 }
00990               else if (seqlen != seqlen0) {
00991                 erralert(eUnequalSize);
00992                 if (seqlen < seqlen0) seq = (char *)realloc(seq, seqlen0); 
00993                 for (i=seqlen; i<seqlen0; i++) seq[i]= gPretty.gapchar;
00994                 seqlen = seqlen0;
00995                 seq[seqlen] = 0; 
00996                 }
00997               break;
00998 
00999             }
01000 
01001           if (doupper)
01002             for (i = 0; i<seqlen; i++) seq[i] = to_upper(seq[i]);
01003           else if (dolower)
01004             for (i = 0; i<seqlen; i++) seq[i] = to_lower(seq[i]);
01005 
01006           if (doreverse) {
01007             long  j, k;
01008             char  ctemp;
01009             for (j=0, k=seqlen-1; j <= k; j++, k--) {
01010               ctemp = compl[seq[j] - ' '];
01011               seq[j] = compl[seq[k] - ' '];
01012               seq[k] = ctemp;
01013               }
01014             }
01015 
01016           if ((gPretty.isactive || outform==kPAUP) && gPretty.domatch && firstseq != NULL) {
01017             for (i=0; i<seqlen; i++)
01018               if (seq[i]==firstseq[i]) seq[i]= gPretty.matchchar;
01019             }
01020 
01021 
01022           if (gPretty.isactive && gPretty.numtop && seqout == 0) {
01023             gPretty.numline = 1;
01024             indexout();
01025             (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
01026             gPretty.numline = 2;
01027             indexout();
01028             (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
01029             gPretty.numline = 0;
01030             }
01031 
01032           indexout();
01033           nlines = writeSeq( fout, seq, seqlen, outform, seqidptr);
01034           seqout++;
01035           }
01036 
01037         if ((gPretty.isactive || outform==kPAUP) && gPretty.domatch && firstseq == NULL) {
01038           firstseq= seq;
01039           seq = NULL;
01040           }
01041         else if (seq!=NULL) { free(seq); seq = NULL; }
01042 
01043 #ifdef NCBI
01044        if ( (format == kASNseqentry || format == kASNseqset)
01045           && seqidptr && seqidptr!= seqid)
01046             free(seqidptr);
01047 #endif
01048         if (chooseall) whichSeq++;
01049         else if (iwhichlist<nwhichlist) whichSeq= whichlist[iwhichlist++];
01050         else whichSeq= 0;
01051         }
01052       if (closein) { fclose(fin); closein= false; }
01053       }
01054     whichSeq  = 0;
01055   } while (nfile > 0 || !quietly);
01056 
01057 
01058 fini:
01059   if (firstseq) { free(firstseq); firstseq= NULL; }
01060   if (err || listonly) exit_main(err);
01061 
01062   if (gPretty.isactive && gPretty.numbot) {
01063     gPretty.numline = 2;
01064     indexout();
01065     (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
01066     gPretty.numline = 1;
01067     indexout();
01068     (void) writeSeq( fout, seq, seqlen, outform, seqidptr);
01069     gPretty.numline = 0;
01070     }
01071 
01072   if (outform == kMSF) {
01073     if (*oname) cp= oname; else cp= inputfile;
01074     fprintf(foo,"\n %s  MSF: %d  Type: N  January 01, 1776  12:00  Check: %d ..\n\n",
01075                   cp, seqlen, checkall);
01076     }
01077 
01078   if (outform == kPAUP) {
01079     fprintf(foo,"#NEXUS\n");
01080     if (*oname) cp= oname; else cp= inputfile;
01081     fprintf(foo,"[%s -- data title]\n\n", cp);
01082     /* ! now have header lines for each sequence... put them before "begin data;... */
01083     }
01084 
01085   if (outform==kPhylip && interleaved) {
01086     if (phylvers >= 4) fprintf(foo," %d %d\n", seqout, seqlen);
01087     else fprintf(foo," %d %d YF\n", seqout, seqlen);
01088     }
01089 
01090   if (interleaved) {
01091     /* interleave species lines in true output */
01092     /* nlines is # lines / sequence */
01093     short iline, j, leaf, iseq;
01094     char  *s = stempstore;
01095 
01096     indexout();  noutindex--; /* mark eof */
01097 
01098     for (leaf=0; leaf<nlines; leaf++) {
01099       if (outform == kMSF && leaf == 1) {
01100         fputs("//\n\n", foo);
01101         }
01102       if (outform == kPAUP && leaf==1) {
01103         switch (seqtype) {
01104           case kDNA     : cp= "dna"; break;
01105           case kRNA     : cp= "rna"; break;
01106           case kNucleic : cp= "dna"; break;
01107           case kAmino   : cp= "protein"; break;
01108           case kOtherSeq: cp= "dna"; break;
01109           }
01110         fprintf(foo,"\nbegin data;\n");
01111         fprintf(foo," dimensions ntax=%d nchar=%d;\n", seqout, seqlen);
01112         fprintf(foo," format datatype=%s interleave missing=%c", cp, gPretty.gapchar);
01113         if (gPretty.domatch) fprintf(foo," matchchar=%c", gPretty.matchchar);
01114         fprintf(foo,";\n  matrix\n");
01115         }
01116 
01117       for (iseq=0; iseq<noutindex; iseq++) {
01118         fseek(ftmp, outindex[iseq], 0);
01119         for (iline=0; iline<=leaf; iline++)
01120           if (!fgets(s, 256, ftmp)) *s= 0;
01121         if (ftell(ftmp) <= outindex[iseq+1])
01122           fputs( s, foo);
01123         }
01124 
01125       for (j=0; j<gPretty.interline; j++)
01126         fputs( "\n", foo);  /* some want spacer line */
01127       }
01128     fclose(ftmp); /* tmp disappears */
01129     fout= foo;
01130     }
01131 
01132   if (outform == kASN1)  fprintf( foo, "} }\n");
01133   if (outform == kPAUP)  fprintf( foo,";\n  end;\n");
01134 
01135   if (outindex != NULL) free(outindex);
01136   exit_main(0);
01137 }
01138 
01139 

Generated on Tue Nov 16 15:18:18 2004 for SCIL by doxygen1.2.16