/* A MULTIPLE ALIGNMENT PROGRAM for syntenic genomic sequences (MAP2): Copyright (c) 2005 Iowa State University Research Foundation, Inc. All Rights Reserved Liang Ye and Xiaoqiu Huang Department of Computer Science Iowa State University 226 Atanasoff Hall Ames, IA 50011-1040, USA Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the IOWA STATE copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to incorporate this software into commercial products may be obtained from Iowa State University Research Foundation, Inc. IN NO EVENT SHALL THE AUTHOR OR IOWA STATE UNIVERSITY BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF IOWA STATE UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. IOWA STATE UNIVERSITY SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND IOWA STATE UNIVERSITY HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. Last update time 3/15/2004 The program MAP2 is developed based on the algorithm of MAP (Huang, 1994) and GAP3 (Huang and Chao, 2002). MAP2 computes a global alignment from homologous sequences from different species. The program takes as input a file of sequences in fasta format, for example, seq.fasta. Similar regions are reported in an output file. To align a set of sequences in file seq.fasta, use a command of form map2 seq.fasta bpenalty ms q r > result where map2 is the name of the object code, bpenalty is a positive integer specifying the major difference. ms is a negative integer specifying mismatch weight, q and r are non-negative integers specifying gap-open and gap-extend penalties,respectively. Output alignments are saved in the file "result". An example command is map2 seq.fasta 250 -12 38 3 > result The results are in several files, including the standard output, seqfilename.info, seqfilename.flat. 1 standard output The standard output can be directed to an output file, say seqfilename.out. The .out file shows alignments of similar regions in plain text. MAP2 builds alignments in the order of a binary tree based on pairwise sequence similarity scores. For each node in the tree, an alignment of similar regions for the corresponding subset of sequences is reported along with a simple summary including the number of similar regions, the total alignment length of the similar regions, and the portion of the alignment with a column consensus percent identity bigger than 75%. 2 .flat This output file shows alignments in fasta format for each node in the tree. Each difference section in the alignment is makred by '#'. The length information for each '#' is stored in the .info file. 3 .info There are three parts in this output file: (a) the parameter values used in MAP2, (b) the order of pairwise scores used, and (c) the length information for each difference section marked by '#'. REFERENCE: Ye, L, Huang, X. MAP2: multiple alignment of syntenic genomic sequences. Nucleic Acids Res. 2005 33: 162-170 */ #include #include #define NAMELEN 14 /* can't exceed 14 */ #define MATCHSC 10 /* match score */ static int numrows = 30; /*max number of rows in the basis of diff()*/ static int neginf = -10000; /* negative infinity */ static double consencf = 0.75; /* the cutoff of computing consensus identity */ static int match, mismh; /* max and min substitution weights */ static double SCORE; /* score of a best alignment */ static int v[128][128]; /* substitution scores */ static int q, r; /* gap penalties */ static int qr; /* qr = q + r */ static int pay; /* Penalty for a major difference */ typedef struct OVERLAP /* of 5' and 3' segments */ { int id1; /* id of sequence 1 */ int id2; /* id of sequence 2 */ double score; /* score of overlap alignment */ struct OVERLAP *next; } over, *overptr; struct SEG { char *name; /* name of sequence */ int len; /* length of ssequence name */ char *seq; /* sequence */ int length; /* length of sequence */ overptr list; /* list of overlapping edges */ } *segment; /* array of sequence records */ int seg_num; /* The number of segments */ int maxlen; /* maximum of sequence lengths */ overptr *edge; /* set of overlapping edges */ int edge_num; /* The number of overlaps */ struct ALG { char *row[2]; /* one row of multiple alignment */ int len[2]; /* maximum length of row */ int flag; /* row[flag] is being used; flag = 0 or 1 */ int next; /* id of next sequence in alignment */ int head; /* id of the first sequence in alignment */ int num; /* number of sequences in this group if it is head */ int length; /* length of alignment */ int pos; /* current position in layout */ char **rect; /* pointer to 2-D alignment matrix */ } *group; /* sequence groups */ main(argc, argv) int argc; char *argv[]; { int M; /* Sequence length */ int ms; /* User-supplied weights */ FILE *Ap, *Sp, *Dp, *Fp, *ckopen(); char *ckalloc(); /* space-allocating function */ char alph[129], *s; /* alphabet */ int size; /* size of alphabet */ int total; /* Total of sequence lengths */ int number; /* The number of sequences */ char *sequence; /* Storing all sequences */ int symbol, prev; /* The next character */ int i, j; /* index variables */ short heading; /* 1: heading; 0: sequence */ char *oname; /* info file includes ordered pairwise scores and all length info for '#' */ char *fname; /* .flat file for all the outputs with '#' standing for one different block */ if ( argc != 6 ) { fprintf(stderr,"Usage: %s Seq major_diff mismatch gap_open gap_extend\n\n", argv[0]); fprintf(stderr,"Seq file of multiple sequences in FASTA format\n"); fprintf(stderr,"major_diff gap length for a constant gap penalty, a positive integer\n"); fprintf(stderr,"mismatch a negative integer for DNA or PAM250/BLOSUM62 for protein\n"); fprintf(stderr,"gap_open gap open penalty, a non-negative integer \n"); fprintf(stderr,"gap_extend gap extension penalty, a positive integer \n"); exit(1); } /* determine number of sequences and total lengths */ j = maxlen = 0; Ap = ckopen(argv[1], "r"); prev = '\n'; for (total = 3, number = 0; ( symbol = getc(Ap)) != EOF ; total++ ) { if ( symbol == '>' && prev == '\n' ) number++; prev = symbol; } if ( number == 0 ) fatal("There are no sequences or sequences are in wrong format"); total += number * 20; /* allocate space for sequences */ sequence = ( char * ) ckalloc( total * sizeof(char)); segment = ( struct SEG * ) ckalloc( number * sizeof(struct SEG)); /* read the sequences into sequence */ M = 0; Ap = ckopen(argv[1], "r"); number = -1; heading = 0; prev = '\n'; for ( i = 0; ( symbol = getc(Ap)) != EOF ; ) { if ( symbol != '\n' && ( heading || ! isspace(symbol) ) ) sequence[++i] = symbol; if ( symbol == '>' && prev == '\n' ) { heading = 1; if ( number >= 0 ) { segment[number].length = i - j - 1; if ( maxlen < i - j - 1 ) maxlen = i - j - 1; if ( i - j - 1 > M ) M = i - j -1; } number++; j = i; segment[number].name = &(sequence[i]); segment[number].list = NULL; } if ( heading && symbol == '\n' ) { heading = 0; segment[number].len = i - j; segment[number].seq = &(sequence[i]); j = i; } prev = symbol; } segment[number].length = i - j; if ( maxlen < i - j ) maxlen = i - j; if ( i - j > M ) M = i - j; seg_num = ++number; (void) fclose(Ap); edge_num = 0; (void) sscanf(argv[2],"%d", &pay); if ( pay < 1 ) fatal("Penalty for a major difference is a positive integer"); (void) sscanf(argv[argc-2],"%d", &q); if ( q < 0 ) fatal("The gap-open penalty is a nonnegative integer"); (void) sscanf(argv[argc-1],"%d", &r); if ( r < 0 ) fatal("The gap-extend penalty is a nonnegative integer"); qr = q + r; /* check if the argument represents a negative integer */ s = argv[argc-3]; if ( *s == '-' ) s++; for ( ; *s >= '0' && *s <= '9' ; s++ ); if ( *s == '\0' ) { (void) sscanf(argv[argc-3],"%d", &ms); if ( ms >= 0 ) fatal("The mismatch weight is a negative integer"); match = MATCHSC; mismh = ms; /* set match and mismatch weights */ for ( i = 0; i < 128 ; i++ ) for ( j = 0; j < 128 ; j++ ) if (i == j ) v[i][j] = match; else v[i][j] = mismh; v['N']['N'] = mismh; v['n']['n'] = mismh; v['A']['a'] = v['a']['A'] = match; v['C']['c'] = v['c']['C'] = match; v['G']['g'] = v['g']['G'] = match; v['T']['t'] = v['t']['T'] = match; } else { /* read a file containing alphabet and substitution weights */ Sp = ckopen(argv[argc-3], "r"); (void) fscanf(Sp, "%s", alph); size = strlen(alph); match = mismh = 0; for ( i = 0; i < 128 ; i++ ) for ( j = 0; j < 128 ; j++ ) v[i][j] = 0; for ( i = 0; i < size ; i++ ) for ( j = 0; j <= i ; j++ ) { (void) fscanf(Sp, "%d", &ms); v[alph[i]][alph[j]] = v[alph[j]][alph[i]] = ms; if ( ms > match ) match = ms; if ( ms < mismh ) mismh = ms; } } for ( i = 0; i < 128 ; i++ ) v['-'][i] = v[i]['-'] = - r; v['-']['-'] = 0; i = strlen(argv[1]) + 20; oname = (char*) ckalloc(i * sizeof(char)); sprintf(oname, "%s.info", argv[1]); fname = (char*) ckalloc(i * sizeof(char)); sprintf(fname, "%s.flat", argv[1]); Dp = ckopen(oname, "w"); Fp = ckopen(fname, "w"); Pairwise(total); Multiple(Dp, Fp); (void) fclose(Dp); (void) fclose(Fp); } static int *CC, *RR; /* forward and reverse matrix score */ static int *DD, *SS; /* forward and reverse deletion score */ static int *S; /* saving operations for diff, negative, zero, or positive numbers */ static int *HH; /* for long deletion gaps in forward pass from above direction*/ static int *XX; /* for long deletion gaps in reverse pass from above direction*/ static short *MD; /* a list of flags for major differences, 0 or 1 */ struct TraceType { char c, d, e, h; }; /* a traceback record */ static struct TraceType **tracem; /* traceback information */ typedef struct SYMBOL /* the structure to save length information for each # */ { int len; /* the base length for each # */ struct SYMBOL *next; } sym, *symptr; static symptr *list; /* maintain one list for each sequence to update the information for all # in the sequence */ static symptr dlist; /* save all the deleted nodes for later use */ /* The following definitions are for function diff() */ int diff(); #define gap(k) ((k) <= 0 ? 0 : q+r*(k)) /* k-symbol indel score */ static int *sapp; /* Current script append ptr */ static int last; /* Last script op appended */ static short *mdptr; /* pointer to current major difference flag */ static short premd; /* last major difference flag */ static int no_match; /* estimate for the length of the similar region */ /* Append "Delete k" op */ #define DEL(k, md) \ { \ if ( last < 0 && premd == md ) \ last = sapp[-1] -= (k); \ else \ { last = *sapp++ = -(k); \ premd = *mdptr++ = md; \ } \ } /* Append "Insert k" op */ #define INS(k, md) \ { \ if ( last > 0 && premd == md ) \ last = sapp[-1] += (k); \ else \ { last = *sapp++ = (k); \ premd = *mdptr++ = md; \ } \ } /* Append "Replace" op */ #define REP \ { last = *sapp++ = 0; \ premd = *mdptr++ = 0; \ } /* Perform pair-wise comparisons of sequences. */ Pairwise(total) int total; /* total sequence length */ { int i, j; /* row and column indices */ char *A, *B; /* pointers to sequences */ int M, N; /* sequence lengths */ overptr node1; /* pointer to overlap */ symptr node2; char *ckalloc(); /* space-allocating function */ /* allocate space for all vectors */ j = (total + 1) * sizeof(int); CC = ( int * ) ckalloc(j); DD = ( int * ) ckalloc(j); RR = ( int * ) ckalloc(j); SS = ( int * ) ckalloc(j); HH = ( int * ) ckalloc(j); XX = ( int * ) ckalloc(j); S = ( int * ) ckalloc(2 * j); MD = ( short * ) ckalloc( j * sizeof(short) ); i = (numrows + 1) * sizeof(struct TraceType *); tracem = ( struct TraceType ** ) ckalloc(i); j = j * sizeof(struct TraceType); for ( i = 0; i <= numrows; i++ ) tracem[i] = ( struct TraceType * ) ckalloc(j); list = (symptr *) ckalloc( seg_num * sizeof(symptr)); for (i = 0; i < seg_num; i++) /*add dummy head for each list*/ { node2 = (symptr) ckalloc(sizeof(sym)); node2->len = 0; node2->next = NULL; list[i] = node2; } for ( i = 0; i < seg_num - 1 ; i++ ) { A = segment[i].seq; M = segment[i].length; for ( j = i+1; j < seg_num ; j++ ) { B = segment[j].seq; N = segment[j].length; node1 = ( overptr ) ckalloc( (int ) sizeof(over)); SCORE = - ( 2 * q + (M + N) * r + 1000); if ( M <= N ) { big_pass(A,B,M,N); node1->id1 = i; node1->id2 = j; node1->score = SCORE; node1->next = segment[i].list; segment[i].list = node1; } else { big_pass(B,A,N,M); node1->id1 = j; node1->id2 = i; node1->score = SCORE; node1->next = segment[j].list; segment[j].list = node1; } edge_num++; } } } /* find best overlap score between two sequences */ big_pass(A,B,M,N) char A[],B[]; int M,N; { register int i, j; /* row and column indices */ register int c; /* best score at current point */ register int e; /* best score ending with insertion */ register int d; /* best score ending with deletion */ register int h; /* best score ending with different block */ register int s, s1; /* temporary variables */ register int g; /* best score ending with different block */ int y, temp, flag, x, alignlen = 1; /* temporary variables */ int *va; /* pointer to v(A[i], B[j]) */ /* Compute the matrix. CC : the scores of the current row RR : the average alignment length without considering different blocks DD : the scores of the current row, ending with deletion */ /* Initialize the 0th row, starting or ending gap is not penalized */ g = -pay; CC[0] = c = -pay; RR[0] = 0; for (j = 1; j <= N; j++) { RR[j] = 0; CC[j] = c; DD[j] = c - q; HH[j] = g; } for (i = 1; i <= M; i++) { s = c = CC[0]; s1 = RR[0]; e = s - q; h = g; va = v[A[i]]; for (j = 1; j <= N; j++) { flag = 0; if ((temp = c - qr) > (e = e - r)) e = temp; if ( c < (temp = CC[j] ) ) {c = temp; flag = 1;} if ( (c = c - pay) < (y = HH[j]) ) { c = y; flag = 1;} if ( h < c ) h = c; else flag = 0; if ((c = temp - qr) > (d = DD[j] - r)) d = c; c = s+va[B[j]]; x = s1 + 2; if ( c < d ) { c = d; x = RR[j] + 1; } if ( c < e ) { c = e; x = RR[j-1] + 1; } if ( c < h ) { c = h; if (!flag) x = RR[j-1]; else x = RR[j]; } s = temp; s1 = RR[j]; RR[j] = x; CC[j] = c; DD[j] = d; HH[j] = h; if ((j == N || i == M) && c > SCORE) { SCORE = c; alignlen = RR[j]; } } } SCORE = 2*SCORE/alignlen; } /* Construct mutiple alignments */ Multiple(Dp, Fp) FILE *Dp, *Fp; { char *ckalloc(); /* space-allocating function */ int i, j, k, t; /* index variables */ overptr node1; /* temporary pointer */ int sorted; /* boolean variable */ char *a, *b; /* temporary pointers */ int head1, head2; /* ids of first sequences in alignments */ struct ALG *pa, *pb; /* pointers to group elements */ char *tname; /* All the variables below for order info file */ int len; char name[NAMELEN+3]; group = ( struct ALG * ) ckalloc( seg_num * sizeof(struct ALG)); for ( i = 0; i < seg_num; i++ ) { group[i].row[0] = ( char * ) ckalloc( 2 * maxlen * sizeof(char)); group[i].row[1] = ( char * ) ckalloc( 2 * maxlen * sizeof(char)); group[i].len[0] = group[i].len[1] = 2 * maxlen; group[i].flag = 1; group[i].next = -1; group[i].head = i; group[i].num = 1; group[i].length = k = segment[i].length; group[i].pos = 0; group[i].rect = ( char ** ) ckalloc( sizeof(char *)); group[i].rect[0] = a = group[i].row[1]; a[0] = ' '; for ( b = segment[i].seq, j = 1; j <= k; j++ ) a[j] = b[j]; a[j] = ' '; } /* sort all the overlap scores */ edge = ( overptr * ) ckalloc( edge_num * sizeof(overptr) ); for ( j = 0, i = 0; i < seg_num; i++ ) for ( node1 = segment[i].list; node1 != NULL; node1 = node1->next ) edge[j++] = node1; edge_num = j; for ( i = edge_num - 1; i > 0; i-- ) { sorted = 1; for ( j = 0; j < i; j++ ) if ( edge[j]->score < edge[j+1]->score ) { node1 = edge[j]; edge[j] = edge[j+1]; edge[j+1] = node1; sorted = 0; } if ( sorted ) break; } fprintf(Dp, "Max_Match Major_Block_Penalty Min_Mismatch Gap_Open_Penalty Gap_Extension_Penalty\n"); fprintf(Dp, " %d %d %d %d %d\n\n", match, pay, mismh, q, r); fprintf(Dp, "\nThe order of pairwise scores:\n\n"); fprintf(Dp, "id1\t\tid1_len\t\tid2\t\tid2_len\t\toverlap score\n"); for ( k = 0; k < edge_num; k++ ) { len = segment[edge[k]->id1].len; tname = segment[edge[k]->id1].name + 1; for (j = 0; j < len && j < NAMELEN; j++) name[j] = *tname++; name[j] = '\0'; (void) fprintf(Dp, "%s\t\t%d", name, segment[edge[k]->id1].length); len = segment[edge[k]->id2].len; tname = segment[edge[k]->id2].name + 1; for (j = 0; j < len && j < NAMELEN; j++) name[j] = *tname++; name[j] = '\0'; (void) fprintf(Dp, "\t\t%s\t\t%d\t\t", name, segment[edge[k]->id2].length); fprintf(Dp, "%f\n", edge[k]->score); } fprintf(Dp, "\n\nAlignment in a flat format:"); for ( k = 0; k < edge_num; k++ ) { head1 = group[edge[k]->id1].head; head2 = group[edge[k]->id2].head; if ( head1 != head2 ) { if ( group[head1].length > group[head2].length ) { t = head1; head1 = head2; head2 = t; } pa = &group[head1]; pb = &group[head2]; sapp = S; last = 0; mdptr = MD; premd = 1; no_match = 0; diff(pa->rect,pb->rect,pa->length,pb->length,pa->num,pb->num,0,0,-pay,-pay-q,-pay,-pay,-pay-q,-pay); if (no_match < 5) continue; Merge(head1, head2, S, MD); Show(head1); FlatFormat(head1, Dp, Fp); } } } /* Merge two sequence alignment according to script S and MD */ Merge(head1, head2, S, MD) int head1, head2; /* ids of first sequences in two alignments */ int S[]; /* script for alignment*/ short MD[]; /* major difference flags */ { char **rect1, **rect2; /* pointers to two input alignments */ char **rect; /* pointer to resulting alignment */ int size1, size2; /* number of sequences in two alignments */ int size; /* number of sequences in resulting alignment */ int limit; /* maximum length of resulting alignment */ int tail; /* id of last sequence in alignment 1 */ int i, j, h, k, x, y; /* index variables */ int M, N; /* lengths of alignments */ int op; /* current script operation */ char *ckalloc(); /* space-allocating function */ symptr Getnode(); /* function for getting a new node */ int flag; /* index of row */ short cmd, precmd; /* current major difference flag and the previos flag */ int len2; /* temporary variable to calculate base length in each # */ int *base; /* array to update the base length in each # for each sequence */ symptr *pre; /* current pointer for the position in the list */ symptr node2, nodep, lptr; /* temporary node */ int symnum; /* the number of '#' in a different block*/ char ch; size1 = group[head1].num; size2 = group[head2].num; size = size1 + size2; rect = ( char ** ) ckalloc( size * sizeof(char *)); limit = 2 + group[head1].length + group[head2].length; base = (int *) ckalloc( size * sizeof(int)); pre = (symptr *) ckalloc( size * sizeof(symptr)); for ( h = 0, k = head1; k != -1; tail = k, k = group[k].next ) { pre[h] = list[k]; /*each time pre point to the first element*/ group[k].flag = flag = 1 - group[k].flag; if ( group[k].len[flag] < limit ) { group[k].row[flag] = (char *) ckalloc(2 * limit * sizeof(char)); group[k].len[flag] = 2 * limit; } rect[h++] = group[k].row[flag]; } group[tail].next = head2; for ( k = head2; k != -1; k = group[k].next ) { pre[h] = list[k]; group[k].flag = flag = 1 - group[k].flag; group[k].head = head1; if ( group[k].len[flag] < limit ) { group[k].row[flag] = (char *) ckalloc(2 * limit * sizeof(char)); group[k].len[flag] = 2 * limit; } rect[h++] = group[k].row[flag]; } for ( h = 0; h < size; h++ ) rect[h][0] = ' '; rect1 = group[head1].rect; rect2 = group[head2].rect; k = 1; M = group[head1].length; N = group[head2].length; op = 0; precmd = 0; i = 0; j = 0; while (i < M || j < N) { if (op == 0 && *S == 0) { op = *S++; cmd = *MD++; for ( ++i, h = 0; h < size1; h++ ) rect[h][k] = rect1[h][i]; for ( ++j, h = size1; h < size; h++ ) rect[h][k] = rect2[h-size1][j]; } else { if (op == 0) { op = *S++; cmd = *MD++; } if (op > 0) { if ( cmd ) { symnum = 0; if (size2 > 1) /* check the number of '#' in this region*/ { for (x = size1; x < size; x++) base[x] = 0; for (x = 1; x <= op; x++) { if (rect2[0][j+x] == '#') symnum++; for ( y = 0; y < size2; y++) if ((ch = rect2[y][j+x]) != ' '&&ch != '-'&&ch != '#') base[size1+y] += 1; } } else { base[size1] = op; } if (!precmd) /* if precmd is 0 */ { for ( h = 0; h < size; h++ ) rect[h][k] = '#'; /* for sequences in rect1, new nodes with length 0*/ for ( h = 0; h < size1; h++ ) { node2 = Getnode(0); node2->next = pre[h]->next; pre[h]->next = node2; pre[h] = node2; } if (symnum == 0) /*produce new node*/ for ( h = size1; h < size; h++ ) { node2 = Getnode(base[h]); node2->next = pre[h]->next; pre[h]->next = node2; pre[h] = node2; } else /*symnum > 0*/ for ( h = size1; h < size; h++ ) { pre[h] = pre[h]->next; lptr = pre[h]; len2 = lptr->len; for (x = 1; x < symnum; x++) { if(lptr->next != NULL) { lptr = lptr->next; len2 += lptr->len; } else { fatal("symnum = %d is wrong in op>0", symnum);} } pre[h]->len = len2 + base[h]; if ( symnum > 1) /*delete some nodes and save in list*/ { nodep = dlist; dlist = pre[h]->next; pre[h]->next = lptr->next; lptr->next = nodep; } } } else /*precmd == 1*/ for ( h = size1; h < size; h++ ) { lptr = pre[h]; len2 = lptr->len; for (x = 0; x < symnum; x++) { lptr = lptr->next; len2 += lptr->len; } pre[h]->len = len2 + base[h]; if (symnum > 0) { nodep = dlist; dlist = pre[h]->next; pre[h]->next = lptr->next; lptr->next = nodep; } } j += op; op = 0; } else /* cmd == 0*/ { for ( h = 0; h < size1; h++ ) if (rect1[h][i] == ' ' || rect1[h][i+1] == ' ') rect[h][k] = ' '; else rect[h][k] = '-'; for ( ++j, h = size1; h < size; h++ ) rect[h][k] = rect2[h-size1][j]; op--; } } else /* for op < 0 */ { if ( cmd ) { symnum = 0; if (size1 > 1) /* check the number of '#' in this region*/ { for (x = 0; x < size1; x++) base[x] = 0; for (x = 1; x <= -op; x++) { if (rect1[0][i+x] == '#') symnum++; for (y = 0; y < size1; y++) if ((ch = rect1[y][i+x]) != ' '&&ch != '-'&&ch != '#') base[y] += 1; } } else { base[0] = -op; } if (!precmd) /* if precmd is 0 */ { for ( h = 0; h < size; h++ ) rect[h][k] = '#'; for ( h = size1; h < size; h++ ) { node2 = Getnode(0); node2->next = pre[h]->next; pre[h]->next = node2; pre[h] = node2; } if (symnum == 0) /*produce new node*/ for ( h = 0; h < size1; h++ ) { node2 = Getnode(base[h]); node2->next = pre[h]->next; pre[h]->next = node2; pre[h] = node2; } else /*symnum > 0*/ for ( h = 0; h < size1; h++ ) { pre[h] = pre[h]->next; lptr = pre[h]; len2 = lptr->len; for (x = 1; x < symnum; x++) { lptr = lptr->next; len2 += lptr->len; } pre[h]->len = len2 + base[h]; if (symnum > 1) { nodep = dlist; dlist = pre[h]->next; pre[h]->next = lptr->next; lptr->next = nodep; } } } else /*precmd == 1*/ for ( h = 0; h < size1; h++ ) { lptr = pre[h]; len2 = lptr->len; for (x = 0; x < symnum; x++) { if(lptr->next != NULL) { lptr = lptr->next; len2 += lptr->len; } else { fatal("symnum = %d is wrong in op<0", symnum);} } pre[h]->len = len2 + base[h]; if (symnum > 0) { nodep = dlist; dlist = pre[h]->next; pre[h]->next = lptr->next; lptr->next = nodep; } } i += (-op); op = 0; } else /* cmd == 0*/ { for ( ++i, h = 0; h < size1; h++ ) rect[h][k] = rect1[h][i]; for ( h = size1; h < size; h++ ) if (rect2[h-size1][j] == ' ' || rect2[h-size1][j+1] == ' ') rect[h][k] = ' '; else rect[h][k] = '-'; op++; } } } if (!precmd || !cmd) k++; precmd = cmd; } for ( h = 0; h < size; h++ ) rect[h][k] = ' '; group[head1].num = size; group[head1].length = k - 1; group[head1].rect = rect; } /* diff2(A,B,M,N,U,W,mm,nn,c00,d00,h00,cmn,dmn,hmn) returns the score of an optimum conversion between A[0..U-1][mm+1..mm+M] and B[0..W-1][nn+1..nn+N] such that the upper left and lower right corners are initialized to the given values: C(0,0) = c00, D(0,0) = d00, H(0,0) = h00, C(m,n) = cmn, D(m,n) = dmn, H(m,n) = hmn. */ int diff(A,B,M,N,U,W,mm,nn,c00, d00, h00, cmn, dmn, hmn) char *A[], *B[]; int M, N; int U, W, mm, nn, c00, d00, h00, cmn, dmn, hmn; { int midi, midj, type; /* Midpoint, type, and cost */ int midc; int cmid, dmid, hmid; /* C, D & H values at midpoint */ { register int i, j; register int c, e, d, s, h; int t, *va; int g, x, y, tmp, tmp1; int p, k; int tt; struct TraceType *trow; /* one row of tracem */ struct TraceType *ten; /* pointer to one entry of tracem */ char letc; /* a letter to indicate matrix type */ /* Boundary cases: M <= numrows */ if ( M <= numrows ) { trow = tracem[M]; ten = trow + N; RR[N] = c = cmn; ten->c = 'c'; e = c - q; ten->e = 'c'; if ( dmn > e ) { t = dmn; ten->d = 'd'; } else { t = e; ten->d = 'c'; } if ( hmn > ( y = c - pay ) ) { g = hmn; ten->h = 'h'; } else { g = y; ten->h = 'c'; } for (j = N-1; j >= 0; j--) { ten = trow + j; if (B[0][nn+j+1] == '#') { XX[j] = g; RR[j] = c = XX[j]; SS[j] = neginf; e = neginf; ten->c = 'h'; ten->h = 'g'; continue; } if (e == neginf) { e = c - qr; ten->e ='c'; } else { e -= r; ten->e = 'e'; } if ( e > g ) { RR[j] = c = e; ten->c = 'e'; } else { RR[j] = c = g; ten->c = 'h'; } SS[j] = c + neginf; ten->d = 'c'; XX[j] = g; ten->h = 'g'; } for (i = M-1; i >= 0; i--) { trow = tracem[i]; ten = trow + N; if (A[0][mm+i+1] == '#' ) { h = g; /*new g value if there is a '#'*/ RR[N] = c = h; t = neginf; /*new t value if there is a '#'*/ ten->c = 'h'; ten->h = 'h'; for(j= N-1; j >= 0; j--) { ten = trow + j; if ( c < (tmp = RR[j] ) ) { c = tmp; ten->h = 'x'; } else ten->h = 'y'; if ( (c = c - pay) < (y = XX[j]) ) { c = y; ten->h = 'h'; } if ( h < c ) h = c; else ten->h = 'g'; XX[j] = h; RR[j] = c = XX[j]; SS[j] = neginf; ten->c = 'h'; } continue; } s = RR[N]; if (t == neginf) { t = s - qr; ten->d = 'c';} else { t -= r; ten->d = 'd';} if ( t > g ) { RR[N] = c = t; ten->c = 'd'; } else { RR[N] = c = g; ten->c = 'h'; } e = c + neginf; ten->e = 'c'; h = g; ten->h = 'h'; for ( x = p = 0; p < U; p++ ) if ( A[p][mm+i+1] != ' ' ) x += 1; for (j = N-1; j >= 0; j--) { ten = trow + j; tmp1 = c - qr; if ( c < (tmp = RR[j] ) ) { c = tmp; ten->h = 'x'; } else ten->h = 'y'; if ( (c = c - pay) < (y = XX[j]) ) { c = y; ten->h = 'h'; } if ( h < c ) h = c; else ten->h = 'g'; if (B[0][nn+j+1] == '#') { XX[j] = h; s = RR[j]; RR[j] = c = h; SS[j] = neginf; e = neginf; ten->c = 'h'; continue; } ten->e = 'e'; if ( e == neginf || tmp1 > (e = e - r)) { e = tmp1; ten->e = 'c'; } c = tmp - qr; ten->d = 'd'; if ( SS[j] == neginf || c > (d = SS[j] - r) ) { d = c; ten->d = 'c'; } c = 0; ten->c = 'c'; for ( p = 0; p < U; p++ ) if ( ( tt = A[p][mm+i+1] ) != ' ' ) { va = v[tt]; for ( k = 0; k < W; k++ ) if ( ( tt = B[k][nn+j+1] ) != ' ' ) c += va[tt]; } for ( y = k = 0; k < W; k++ ) if ( B[k][nn+j+1] != ' ' ) y += 1; c = c / (x * y) + s; if ( c < d ) { c = d; ten->c = 'd'; } if ( c < e ) { c = e; ten->c = 'e'; } if ( c < h ) { c = h; ten->c = 'h'; } s = tmp; RR[j] = c; SS[j] = d; XX[j] = h; } } SS[N] = t; XX[N] = g; midc = RR[0] + c00; letc = 'c'; if ( midc < (tmp = XX[0] + h00 + pay) ) { midc = tmp; letc = 'h'; } if ( midc < (tmp = SS[0] + d00 + q) ) { midc = tmp; letc = 'd'; } i = j = 0; while ( i < M || j < N ) { switch ( letc ) { case 'c' : if ( (letc = tracem[i][j].c ) == 'c' ) { if ( i >= M || j >= N ) fatal("Row/column index exceeds the limit"); i++; j++; REP no_match++; } break; case 'd' : letc = tracem[i][j].d; if ( i >= M ) fatal("D: row index exceeds the limit"); i++; DEL(1, 0) break; case 'e' : letc = tracem[i][j].e; if ( j >= N ) fatal("E: column index exceeds the limit"); j++; INS(1, 0) break; case 'h' : letc = tracem[i][j].h; if ( letc == 'h' || letc == 'x' ) { if ( i >= M ) fatal("Row index exceeds the limit"); i++; DEL(1, 1) if ( letc == 'x' ) letc = 'c'; } else { if ( j >= N ) fatal("Column index exceeds the limit"); j++; INS(1, 1) if ( letc == 'g' ) letc = 'h'; else if ( letc == 'y' ) letc = 'c'; else fatal("Wrong traceback letter"); } break; default : printf("Letter %c\n", letc); fatal("Wrong traceback letter"); } } return midc; } /* Divide: Find optimum midpoint (midi,midj) of cost midc */ midi = M/2; /* Forward phase */ CC[0] = c = c00; /* Compute C(M/2,k), D(M/2,k) & H(M/2,k) for all k */ e = c00 - q; g = h00 > ( y = c00 - pay ) ? h00 : y; for (j = 1; j <= N; j++) { if (B[0][nn+j] == '#') { HH[j] = g; CC[j] = c = HH[j]; DD[j] = neginf; e = neginf; continue; } if (e == neginf) e = c - qr; else e -= r; CC[j] = c = e > g ? e : g; DD[j] = c - q; HH[j] = g; } t = d00 > (y = c00 - q) ? d00 : y; for (i = 1; i <= midi; i++) { if (A[0][mm+i] == '#' ) { CC[0] = h = c = g; t = neginf; for(j= 1; j <= N; j++) { if ( c < (tmp = CC[j] ) ) c = tmp; if ( (c = c - pay) < (y = HH[j]) ) c = y; if ( h < c ) h = c; HH[j] = h; CC[j] = c = h; DD[j] = neginf; } continue; } s = CC[0]; if (t != neginf) t -= r; else t = s - qr; CC[0] = c = t > g ? t : g; e = c - q; h = g; for ( x = p = 0; p < U; p++ ) if ( A[p][mm+i] != ' ' ) x += 1; for (j = 1; j <= N; j++) { tmp1 = c - qr; if ( c < (tmp = CC[j] ) ) c = tmp; if ( (c = c - pay) < (y = HH[j]) ) c = y; if ( h < c ) h = c; if (B[0][nn+j] == '#') { HH[j] = h; s = CC[j]; CC[j] = c = HH[j]; DD[j] = neginf; e = neginf; continue; } if ( e == neginf || tmp1 > (e = e - r)) e = tmp1; c = tmp - qr; if ( DD[j] == neginf || c > (d = DD[j] - r)) d = c; c = 0; for ( p = 0; p < U; p++ ) if ( ( tt = A[p][mm+i] ) != ' ' ) { va = v[tt]; for ( k = 0; k < W; k++ ) if ( ( tt = B[k][nn+j] ) != ' ' ) c += va[tt]; } for ( y = k = 0; k < W; k++ ) if ( B[k][nn+j] != ' ' ) y += 1; c = c / (x * y) + s; if ( c < d ) c = d; if ( c < e ) c = e; if ( c < h ) c = h; s = tmp; CC[j] = c; DD[j] = d; HH[j] = h; } } DD[0] = t; HH[0] = g; RR[N] = c = cmn; /* Reverse phase */ e = cmn - q; /* Compute R(M/2,k), S(M/2,k) & X(M/2,k) for all k */ g = hmn > ( y = cmn - pay ) ? hmn : y; for (j = N-1; j >= 0; j--) { if (B[0][nn+j+1] == '#') { XX[j] = g; RR[j] = c = XX[j]; SS[j] = neginf; e = neginf; continue; } if (e == neginf) e = c - qr; else e -= r; RR[j] = c = e > g ? e : g; SS[j] = c - q; XX[j] = g; } t = dmn > (y = cmn - q) ? dmn : y; for (i = M-1; i >= midi; i--) { if (A[0][mm+i+1] == '#' ) { RR[N] = h = c = g; t = neginf; for(j= N-1; j >= 0; j--) { if ( c < (tmp = RR[j] ) ) c = tmp; if ( (c = c - pay) < (y = XX[j]) ) c = y; if ( h < c ) h = c; XX[j] = h; RR[j] = c = h; SS[j] = neginf; } continue; } s = RR[N]; if (t != neginf) t -= r; else t = s - qr; RR[N] = c = t > g ? t : g; e = c - q; h = g; for ( x = p = 0; p < U; p++ ) if ( A[p][mm+i+1] != ' ' ) x += 1; for (j = N-1; j >= 0; j--) { tmp1 = c - qr; if ( c < (tmp = RR[j] ) ) c = tmp; if ( (c = c - pay) < (y = XX[j]) ) c = y; if ( h < c ) h = c; if (B[0][nn+j+1] == '#') { XX[j] = h; s = RR[j]; RR[j] = c = XX[j]; SS[j] = neginf; e = neginf; continue; } if ( e == neginf || tmp1 > (e = e - r)) e = tmp1; c = tmp - qr; if ( SS[j] == neginf || c > (d = SS[j] - r)) d = c; c = 0; for ( p = 0; p < U; p++ ) if ( ( tt = A[p][mm+i+1] ) != ' ' ) { va = v[tt]; for ( k = 0; k < W; k++ ) if ( ( tt = B[k][nn+j+1] ) != ' ' ) c += va[tt]; } for ( y = k = 0; k < W; k++ ) if ( B[k][nn+j+1] != ' ' ) y += 1; c = c / (x * y) + s; if ( c < d ) c = d; if ( c < e ) c = e; if ( c < h ) c = h; s = tmp; RR[j] = c; SS[j] = d; XX[j] = h; } } SS[N] = t; XX[N] = g; midc = HH[N]+XX[N]+pay; /* Find optimal midpoint */ midj = N; type = 3; for ( j = N; j >= 0; j-- ) { if ( (c = HH[j] + XX[j] + pay) > midc ) { midc = c; midj = j; type = 3; } if ((c = DD[j] + SS[j] + q) > midc) { midc = c; midj = j; type = 2; } if ( (c = CC[j] + RR[j]) > midc ) { midc = c; midj = j; type = 1; } } } /* Conquer: recursively around midpoint */ cmid = CC[midj]; dmid = DD[midj]; hmid = HH[midj]; (void) diff(A,B,midi,midj,U,W,mm,nn,c00,d00,h00,RR[midj],SS[midj],XX[midj]); (void) diff(A,B,M-midi,N-midj,U,W,mm+midi,nn+midj,cmid,dmid,hmid,cmn,dmn,hmn); return midc; } /* Alignment display routine */ /* This function needs the present list and the head of the group just combined*/ Show(head) int head; { int i, j, k, h, n; int length, size; /* the length and size of the alignment in rect */ int len; char name[NAMELEN+3]; char *t; char **rect; char a; int line; char stg[200]; int blank; int num; /* the number of similar regions */ symptr *pre; char *ckalloc(); /* space-allocating function */ int simlen, consenlen, anum, cnum, gnum, tnum; double tmp; (void) printf("\n\n %d sequences in the alignments: \n", group[head].num); if ((length = group[head].length) < 5) { printf("\tNo similar region is detected!\n\n"); return; } pre = (symptr *) ckalloc( seg_num * sizeof(symptr)); for ( j = head; j != -1; j = group[j].next) { pre[j] = list[j]->next; group[j].pos = 0; } rect = group[head].rect; size = group[head].num; line = 0; num = 0; simlen = consenlen = anum = cnum = gnum = tnum = 0; if (rect[0][1] != '#') { num = 1; (void) printf("\n Similar region %d : \n", num); } for ( i = 1; i <= length; ) { if (rect[0][i] == '#') { for ( j = head; j != -1; j = group[j].next) { group[j].pos += pre[j]->len; pre[j] = pre[j]->next; line = 0; } if (i != length) (void) printf("\n\n Similar region %d : \n", ++num); i++; continue; } (void) printf(" "); for ( j = 0; j < 60; j += 10 ) (void) printf(" . :"); line += 60; (void) printf("%7d\n", line); /* compute the length of consensus identity */ for ( k = i; k <= i+59 && k <= length && rect[0][k] != '#'; k++ ) { simlen++; for ( h = 0; h < size; h++ ) { if ((a = rect[h][k]) == 'A' || a == 'a') anum++; if (a == 'C' || a == 'c') cnum++; if (a == 'G' || a == 'g') gnum++; if (a == 'T' || a == 't') tnum++; } if (anum >= (tmp = size * consencf) || cnum >= tmp || gnum >= tmp || tnum >= tmp) consenlen++; anum = cnum = gnum = tnum = 0; } /* display the alignments of rect and positions information */ for ( j = head, h = 0; j != -1; j = group[j].next, h++ ) { len = segment[j].len; t = segment[j].name + 1; for ( k = 0; k < len && k < NAMELEN; k++ ) name[k] = *t++; name[k] = '\0'; t = stg; blank = 1; (void) sprintf(t,"%-15s", name); t += 15; for ( k = i; k <= i+59 && k <= length && rect[h][k] != '#'; k++ ) { (void) sprintf(t++,"%c", (a = rect[h][k]) ); if ( a != ' ' && a != '-' ) group[j].pos++; if ( a != ' ') blank = 0; } for ( n = k; n <= i+59; n++ ) (void) sprintf(t++,"%c", ' '); (void) sprintf(t,"%7d\n\0", group[j].pos); if ( ! blank ) (void) printf("%s", stg); } i = k; (void) printf("\n"); } (void) printf("\nThe total number of similar regions: %d", num); (void) printf("\nThe total alignment length of all similar regions: %d", simlen); (void) printf("\nThe total length with consensus column identity bigger than %5.2f : %d", consencf, consenlen); (void) printf("\nThe portion of length with identity bigger than %5.2f : %5.2f\n\n", consencf, (double)consenlen/simlen); } /* Display output in a flat format */ FlatFormat(head, Dp, Fp) int head; FILE *Dp, *Fp; { int i, j, k, h; int length; int len; char name[NAMELEN+3]; char *t; char **rect; symptr ptr; char a; char stg[200]; rect = group[head].rect; length = group[head].length; fprintf(Dp, "\n\n%d sequences in the alignment\n\n", group[head].num); fprintf(Fp, "\n\n%d sequences in the alignment\n\n", group[head].num); for ( j = head, h = 0; j != -1; j = group[j].next, h++ ) { len = segment[j].len; t = segment[j].name + 1; for ( k = 0; k < len && k < NAMELEN; k++ ) name[k] = *t++; name[k] = '\0'; fprintf(Fp, ">%s\n", name); fprintf(Dp, ">%s\n", name); for (ptr = list[j]->next;ptr != NULL; ptr = ptr->next) fprintf(Dp, "%d ", ptr->len); fprintf(Dp, "\n"); for ( i = 1; i <= length; ) { t = stg; for ( k = i; k <= i+59 && k <= length; k++ ) { a = rect[h][k]; if ( a == ' ' ) a = '-'; *t++ = a; } *t++ = '\0'; fprintf(Fp, "%s\n", stg); i = k; } } } symptr Getnode (length) int length; { char *ckalloc(); /* space-allocating function */ symptr node; if (dlist != NULL) { dlist->len = length; node = dlist; node->next = NULL; dlist = dlist->next; } else { node = (symptr) ckalloc(sizeof(sym)); node->len = length; node->next = NULL; } return node; } /* lib.c - library of C procedures. */ /* fatal - print message and die */ fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) int amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); }