/***************************************/ /* Weeder 1.0 see LICENSE.txt file for terms and conditions of use */ #define MAXSEQ 100000 #define CPS CLOCKS_PER_SEC #include #include #include #include #include #include #include #include #include #include #include char **sequence; char **name; int *lens; int winlen = 8; int MINLEN = 8; int MAXGAP = 10000; int *nuclei; int *revnuclei; int gloplen = 0; char alphabet[4] = { 'A', 'C', 'G', 'T' }; int totseq = 0; int uu, pp, qq; int MAXERR = 0; FILE *F; int reversed = 0; struct pattern { int shadowed; int rank; int overlap; int hamming; int inside; int outside; int hor; int error; int vert; double max; double min; int **matrix; float score; int counter; int inshuffled; char *pat; struct pattern *next; struct brother *nextbro; }; struct brother { char pat[14]; struct brother *nextbro; }; struct pattern *headpat[4]; struct pattern *pointer[4]; struct pattern *endpat; struct pattern *seekpat; struct brother *newbro; int chartoint (char car); void insert (int start, int end, int seqno); struct pattern *mergesrt (struct pattern *primo, int quanti); struct pattern *merge (struct pattern *uno, struct pattern *due); char revcomp (char c); char *revpattern (char *a); FILE *OF; FILE *HF; int adviceflag = 1; main (int argc, char *argv[]) { double percentage; int maah; int rankcount; int flagjump; int flagone = 0; int pid; char urlname[200] = ""; char commandline[200] = ""; FILE *L; char **color; double newmin, newmax; int **newmatrix; double mintmp, maxtmp; double tmpflo; struct pattern *newpat; double expected; double revmy; int jj; int partial; int bestpos; int uno, due; int i, j, k, l; int ii, flagnope; int nn; int howmany = 0; int whereavg = 0; int lastinseq = -1; int n; int nchar; char buffer[MAXSEQ]; char infile[400] = "stdin"; char patfile[400] = ""; int len; int perint; double thr; struct pattern *tmppat; char entry[4][6]; int partiallen = 0; int patlimit = 4; int maxflag = 0, repflag = 0, errflag = 0, minflag = 0; FILE *G; char outfile[200] = ""; char browsefile[200] = ""; char logofile[800] = ""; pid = getpid (); color = (char **) malloc (4 * sizeof (char *)); for (i = 0; i < 4; i++) bzero (entry[i], 6); for (i = 0; i < 4; i++) color[i] = (char *) malloc (10); strcpy (color[2], "gold"); strcpy (color[3], "red"); strcpy (color[1], "blue"); strcpy (color[0], "green"); i = 1; lens = (int *) malloc ((MAXSEQ*2) * sizeof (int)); /* Conteggio lunghezza sequenze */ sprintf (infile, "%s", argv[1]); if ((argc > 2) && (strcmp (argv[2], "S") == 0)) reversed = 1; G = fopen (infile, "r"); if (G == NULL) { fprintf (stderr, "\nError in opening file %s\n", infile); exit (0); } partiallen = 0; l = 0; nchar = 0; totseq = 0; bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); if (buffer[0] != '>') { fprintf (stderr, "\nInput file not in FASTA format\nExiting program\n"); exit (0); } while (nchar != EOF) { totseq++; nchar = fscanf (G, "\n%[^\n]", buffer); while ((buffer[0] != '>') && (nchar != EOF)) { partiallen += strlen (buffer); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); } lens[l++] = partiallen; partiallen = 0; } /* Fine conteggio lunghezza sequenze */ fclose (G); sprintf (patfile, "%s.mix", infile); sprintf (browsefile, "%s.html", infile); sprintf (outfile, "%s.wee", infile); fprintf (stderr, "\nRunning adviser on file %s\nWriting output on files %s and %s\n", patfile, outfile, browsefile); G = fopen (infile, "r"); if (reversed == 0) sequence = (char **) malloc ((totseq + 1) * sizeof (char *)); else sequence = (char **) malloc ((2 * totseq + 1) * sizeof (char *)); if (reversed == 0) name = (char **) malloc ((totseq + 1) * sizeof (char *)); else name = (char **) malloc ((2 * totseq + 1) * sizeof (char *)); for (i = 0; i < totseq + 1; i++) { name[i] = (char *) malloc ((80) * (sizeof (char))); if(reversed == 0) sequence[i] = (char *) malloc ((lens[i] + 1 + winlen) * (sizeof (char))); else sequence[i] = (char *) malloc (2 * (lens[i] + 1 + winlen) * (sizeof (char))); } if (reversed == 1) for (i = 0; i < totseq; i++) sequence[i + totseq] = (char *) malloc ((lens[i] + 1 + winlen) * sizeof (char)); if (reversed == 1) for (i = 0; i < totseq; i++) name[i + totseq] = (char *) malloc ((80) * (sizeof (char))); bzero (buffer, MAXSEQ); l = 0; fscanf (G, "\n%[^\n]", sequence[l]); ERE: if (sequence[l][0] == '>') { strncpy (name[l], sequence[l], 39); if (reversed == 1) { strcpy (name[l + totseq], "REVCOMP "); strcat (name[l + totseq], name[l]); } bzero (sequence[l], lens[l]); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); while ((buffer[0] != '>') && (nchar != EOF)) { strcat (sequence[l], buffer); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); } } bzero (sequence[l + 1], lens[l + 1]); if (buffer[0] == '>') strcat (sequence[l + 1], buffer); len = strlen (sequence[l]); for (i = 0; i < lens[l]; i++) { sequence[l][i] = tolower (sequence[l][i]); } lens[l] = strlen (sequence[l]); l++; if (l < totseq) goto ERE; if (reversed == 1) { for (l = 0; l < totseq; l++) { for (i = lens[l] - 1; i >= 0; i--) { sequence[l + totseq][lens[l] - i - 1] = revcomp (sequence[l][i]); sequence[l][lens[l]+lens[l]-i-1] = revcomp (sequence[l][i]); } lens[l + totseq] = lens[l]; } //totseq = totseq * 2; } fclose (G); endpat = (struct pattern *) malloc (sizeof (struct pattern)); endpat->score = -111111.0; for (i = 0; i < 4; i++) { headpat[i] = (struct pattern *) malloc (sizeof (struct pattern)); headpat[i]->next = endpat; headpat[i]->counter = 0; } G = fopen (patfile, "r"); nchar = fscanf (G, "\n%[^\n]", buffer); while (nchar != EOF) { flagjump = 0; newpat = (struct pattern *) malloc (sizeof (struct pattern)); newpat->pat = (char *) malloc (15); newpat->hor = 0; newpat->vert = 0; newpat->overlap = 0; newpat->hamming = 0; newpat->inshuffled = 0; newpat->inside = 0; newpat->outside = 0; newpat->shadowed = 0; newpat->nextbro = NULL; bzero (newpat->pat, 15); flagone = 1; sscanf (buffer, "%d) %s %f %d", &newpat->rank, newpat->pat, &newpat->score, &newpat->error); newpat->matrix = (int **) malloc (15 * sizeof (int *)); for (i = 0; i < 15; i++) newpat->matrix[i] = (int *) malloc (4 * sizeof (int)); for (i = 0; i < 15; i++) for (j = 0; j < 4; j++) newpat->matrix[i][j] = 0; i = (strlen (newpat->pat) - 6) / 2; tmppat = headpat[i]->next; while ((tmppat->score > -1111.0) && (flagjump == 0)) { if (strcmp (tmppat->pat, newpat->pat) == 0) { flagjump = 1; if (tmppat->rank == newpat->rank) { if (tmppat->score < newpat->score) { tmppat->score = newpat->score; tmppat->error = newpat->error; } } if (tmppat->rank < newpat->rank) { tmppat->score = newpat->score; tmppat->error = newpat->error; } } tmppat = tmppat->next; } if (flagjump == 0) { newpat->next = headpat[i]->next; headpat[i]->next = newpat; headpat[i]->counter++; } else free (newpat); nchar = fscanf (G, "\n%[^\n]", buffer); } for (i = 0; i < 4; i++) if (headpat[i]->counter > 0) headpat[i]->next = mergesrt (headpat[i]->next, headpat[i]->counter); OF = fopen (outfile, "a"); HF = fopen (browsefile, "a"); if (flagone == 0) { fprintf (OF, "\n\nSorry, no interesting motif found\n\n"); fprintf (HF, ""); fprintf (HF, "

Sorry, no interesting motif found

\n"); fprintf (HF, "
"); fclose (OF); fclose (HF); exit (22); } fprintf (OF, "\nYour sequences:\n\n"); fprintf (HF, "\n
Your sequences:

\n\n"); fprintf (HF, ""); for(j = 0; j < totseq; j++) { fprintf (OF, "Sequence %d : %s\n", j+1,name[j]); fprintf (HF, "Sequence %d : %s
\n", j+1,name[j]); } fprintf (HF, "
"); fprintf (OF, "\n\n**** MY ADVICE ****\n\n"); fprintf (HF, ""); fprintf (HF, "

**** MY ADVICE ****

\n"); fprintf (HF, "
"); for (j = 0; j < 4; j++) { tmppat = headpat[j]->next; while (tmppat->score > -111.0) { for (i = j; i < 4; i++) { seekpat = headpat[i]->next; while (seekpat->score > -111.0) { if (strcmp (seekpat->pat, tmppat->pat) != 0) { if ((overlap (seekpat->pat, tmppat->pat, reversed) == 1) && (tmppat->rank < seekpat->rank)) { tmppat->vert = 1; seekpat->vert = 1; seekpat->overlap = 1; tmppat->overlap = 1; seekpat->inshuffled++; tmppat->inshuffled++; newbro = (struct brother *) malloc (sizeof (struct brother)); strcpy (newbro->pat, tmppat->pat); newbro->nextbro = seekpat->nextbro; seekpat->nextbro = newbro; newbro = (struct brother *) malloc (sizeof (struct brother)); strcpy (newbro->pat, seekpat->pat); newbro->nextbro = tmppat->nextbro; tmppat->nextbro = newbro; } if ((hamming (seekpat->pat, tmppat->pat, reversed) <= seekpat->error) && (tmppat->rank < seekpat->rank)) { tmppat->hamming = 1; seekpat->hamming = 1; tmppat->vert = 1; seekpat->vert = 1; seekpat->inshuffled++; tmppat->inshuffled++; newbro = (struct brother *) malloc (sizeof (struct brother)); strcpy (newbro->pat, tmppat->pat); newbro->nextbro = seekpat->nextbro; seekpat->nextbro = newbro; newbro = (struct brother *) malloc (sizeof (struct brother)); strcpy (newbro->pat, seekpat->pat); newbro->nextbro = tmppat->nextbro; tmppat->nextbro = newbro; } if (strlen (seekpat->pat) != strlen (tmppat->pat)) { if (inside (tmppat->pat, seekpat->pat, reversed) == 1) { tmppat->hor = 1; seekpat->hor = 1; tmppat->inside = 1; seekpat->outside = 1; seekpat->inshuffled++; tmppat->inshuffled++; newbro = (struct brother *) malloc (sizeof (struct brother)); strcpy (newbro->pat, tmppat->pat); newbro->nextbro = seekpat->nextbro; seekpat->nextbro = newbro; newbro = (struct brother *) malloc (sizeof (struct brother)); strcpy (newbro->pat, seekpat->pat); newbro->nextbro = tmppat->nextbro; tmppat->nextbro = newbro; } } } seekpat = seekpat->next; } } tmppat = tmppat->next; } } /****************************************************************/ /* INIZIO RICERCA ESATTA */ /****************************************************************/ nuclei = (int *) malloc (winlen * (sizeof (int))); revnuclei = (int *) malloc (winlen * (sizeof (int))); for (i = 0; i < 4; i++) { seekpat = headpat[i]; while (seekpat->next->score > -111.0) { MAXERR = seekpat->next->error; howmany = 0; lastinseq = -1; expected = 0; if ((((seekpat->next->hor > 0) || (seekpat->next->vert > 2)) && ((seekpat->next->vert > 0))) || ((seekpat->next->rank == 1) && (strlen(seekpat->next->pat) > 6))) { for (uu = 0; uu < totseq; uu++) { bestpos = -1; if(reversed == 0) maah = (lens[uu]) - strlen (seekpat->next->pat) + 1; else maah = (2 * lens[uu]) - strlen (seekpat->next->pat) + 1; for (j = 0; j < maah; j++) { k = 0; flagnope = 0; for (pp = 0; pp < strlen (seekpat->next->pat); pp++) if (chartoint (tolower (sequence[uu][j + pp])) < 0) flagnope = 1; if (flagnope == 0) { for (pp = 0; pp < strlen (seekpat->next->pat); pp++) { nuclei[pp] = chartoint (tolower (sequence[uu][j + pp])); if (tolower (sequence[uu][j + pp]) != tolower (seekpat->next->pat[pp])) { k++; } } } else k = MAXERR + 1; if ((k <= MAXERR) &&((reversed == 0)||((j < lens[uu]- strlen (seekpat->next->pat)+1)||(j >= lens[uu])))) { howmany++; nn = strlen (seekpat->next->pat) - 1; pp = 0; for (jj = 0; jj <= nn; jj++) pp += nuclei[nn - jj] * (int) pow (4.0, jj); for (jj = 0; jj <= nn; jj++) { seekpat->next->matrix[jj][nuclei[jj]]++; } if (lastinseq != uu) { lastinseq = uu; bestpos = j; } k = 0; } } } seekpat->next->min = 0; seekpat->next->max = 0; for (pp = 0; pp < strlen (seekpat->next->pat); pp++) { maxtmp = 0; mintmp = 1; for (qq = 0; qq < 4; qq++) { if (((double) seekpat->next->matrix[pp][qq] / howmany) > maxtmp) maxtmp = (double) seekpat->next->matrix[pp][qq] / howmany; if (((double) seekpat->next->matrix[pp][qq] / howmany) < mintmp) mintmp = (double) seekpat->next->matrix[pp][qq] / howmany; } seekpat->next->min += log (mintmp + 0.001); seekpat->next->max += log (maxtmp); } seekpat->next->counter = howmany; } seekpat = seekpat->next; } } // QUI SCRIVE for (i = 0; i < 4; i++) pointer[i] = headpat[i]; if (headpat[2]->counter <= 0) patlimit = 2; for (rankcount = 0; rankcount < headpat[0]->counter / 2; rankcount++) { for (i = 0; i < 4; i++) { if ((rankcount == 0) && (i == 0)) { fprintf (HF, ""); fprintf (OF, "\n *** Interesting motifs (highest-ranking) seem to be : \n"); fprintf (HF, "
*** Interesting motifs (highest-ranking) seem to be :
"); fprintf (HF, "
"); adviceflag = 0; } if ((rankcount == 1) && (i == 0)) { fprintf (HF, ""); fprintf (OF, "\n *** Interesting motifs (not highest-ranking) can also be : \n"); fprintf (HF, "
*** Interesting motifs (not highest-ranking) can also be :
\n"); fprintf (HF, "
"); adviceflag = 0; } if (pointer[i]->next->score > 0) { seekpat = pointer[i]; pointer[i] = pointer[i]->next; //while (seekpat->next->score > -111.0) { if ((((seekpat->next->hor > 0) || (seekpat->next->vert > 2)) && ((seekpat->next->vert > 0))) || ((seekpat->next->rank == 1) && (strlen(seekpat->next->pat) > 6))) { //bzero (logofile, 200); //sprintf (logofile, "%s.%d.%d.logo", infile, i, //seekpat->next->rank); //L = fopen (logofile, "a"); newmatrix = (int **) malloc (15 * sizeof (int *)); for (j = 0; j < 15; j++) newmatrix[j] = (int *) malloc (4 * sizeof (int)); fprintf (OF, "\n\n%s\n", seekpat->next->pat); if (reversed == 1) fprintf (OF, "%s\n ", revpattern (seekpat->next->pat)); fprintf (HF, ""); fprintf (HF, "

%s
", seekpat->next->pat); if (reversed == 1) fprintf (HF, "%s
", revpattern (seekpat->next->pat)); fprintf (HF, "
"); fprintf (OF, "\n%d redundant motifs found:\n", seekpat->next->inshuffled); fprintf (HF, "
%d redundant motifs found:
", seekpat->next->inshuffled); while (seekpat->next->nextbro != NULL) { fprintf (OF, "%s - ", seekpat->next->nextbro->pat); fprintf (HF, "%s - ", seekpat->next->nextbro->pat); seekpat->next->nextbro = seekpat->next->nextbro->nextbro; } fprintf (OF, "\n"); fprintf (HF, "
"); /* if ((seekpat->next->inside == 1) || (seekpat->next->overlap == 1)) { fprintf (OF, "(which might be part of a longer motif) \n"); fprintf (HF, "(which might be part of a longer motif)
"); } if ((seekpat->next->outside == 1)) { fprintf (OF, "(which contains a shorter motif) \n"); fprintf (HF, "(which contains a shorter motif)
"); } if ((seekpat->next->hamming == 1)) { fprintf (OF, "(there's a motif of the same length very similar to this one) \n"); fprintf (HF, "(there's a motif of the same length very similar to this one)
"); } */ fprintf (OF, "\nBest occurrences: \n"); fprintf (OF, "Seq St oligo pos match\n"); fprintf (HF, "
Best occurrences (match percentage):
"); fprintf (HF, "Seq St oligo pos match\n
"); /* if (i == 0) MAXERR = 1; else MAXERR = i + 1; */ MAXERR = seekpat->next->error; // seekpat->next = bestp[i]; for (uu = 0; uu < 15; uu++) for (j = 0; j < 4; j++) newmatrix[uu][j] = 0; howmany = 0; lastinseq = -1; //seekpat->next->inshuffled = 0; expected = 0; //appast = (char *) malloc (strlen (seekpat->next->pat)); for (uu = 0; uu < totseq; uu++) { //fprintf (OF, "\n%s\n", name[uu]); //fprintf(HF,"%10s
\n", name[uu]); bestpos = -1; //bzero (appast, strlen (seekpat->next->pat)); if(reversed == 0) maah = (lens[uu]) - strlen (seekpat->next->pat) + 1; else maah = (2*lens[uu]) - strlen (seekpat->next->pat) + 1; for (j = 0; j < maah; j++) { k = 0; flagnope = 0; for (pp = 0; pp < strlen (seekpat->next->pat); pp++) if (chartoint (tolower (sequence[uu][j + pp])) < 0) flagnope = 1; if (flagnope == 0) { for (pp = 0; pp < strlen (seekpat->next->pat); pp++) { nuclei[pp] = chartoint (tolower (sequence[uu][j + pp])); } } else k = MAXERR + 1; if ((k <= MAXERR) &&((reversed == 0)||((j < lens[uu]- strlen (seekpat->next->pat)+1)||(j >= lens[uu])))) { nn = strlen (seekpat->next->pat) - 1; tmpflo = 0; for (jj = 0; jj <= nn; jj++) { tmpflo += log ((double) seekpat->next-> matrix[jj][nuclei[jj]]) - log ((double) seekpat->next->counter); //fprintf(OF, "%c", alphabet[nuclei[jj]]); } percentage = (100.0 * (tmpflo - seekpat->next->min) / (seekpat->next->max - seekpat->next->min)); if (percentage > 85) { howmany++; //fprintf (L, "%s\n", name[uu]); //fprintf (HF, "
%.20s
", name[uu]); if(reversed == 0) { fprintf (OF, "%3d + ",uu+1); fprintf (HF, "%3d + ",uu+1); } else { if(j < lens[uu]) { fprintf (OF, "%3d + ",uu+1); fprintf (HF, "%3d + ",uu+1); } else { fprintf (OF, "%3d - ",uu+1); fprintf (HF, "%3d - ",uu+1); } } if (percentage < 90) { fprintf (OF, "["); fprintf (HF, "["); } else { fprintf (OF, " "); fprintf (HF, "."); } for (jj = 0; jj <= nn; jj++) { fprintf (OF, "%c", alphabet[nuclei[jj]]); //fprintf (L, "%c", alphabet[nuclei[jj]]); fprintf (HF, "%c", color[nuclei[jj]], alphabet[nuclei[jj]]); if (percentage >= 90) newmatrix[jj][nuclei[jj]]++; } if (percentage < 90) { fprintf (OF, "]"); fprintf (HF, "]"); } else { fprintf (OF, " "); fprintf (HF, "."); } if ((reversed == 1)) { /* fprintf (OF, "\n"); fprintf (HF, "
"); if (percentage < 90) { fprintf (OF, " ["); fprintf (HF, " ["); } else { fprintf (OF, " "); fprintf (HF, " ."); } for (jj = nn; jj >= 0; jj--) { fprintf (OF, "%c", revcomp (alphabet [nuclei[jj]])); //fprintf (L, "%c", alphabet[nuclei[jj]]); fprintf (HF, "%c", revcomp (alphabet [nuclei[jj]])); } if (percentage < 90) { fprintf (OF, "]"); fprintf (HF, "]"); } else { fprintf (OF, " "); fprintf (HF, "."); } */ if(j > lens[uu]) { fprintf (OF, " %4d ", 2*lens[uu] - j - strlen (seekpat->next->pat) + 1); fprintf (HF, "
%4d ", 2*lens[uu] - j - strlen (seekpat->next->pat) + 1); } else { fprintf (OF, " %4d ", j + 1); fprintf (HF, "
%4d ", j + 1); } } else { fprintf (OF, " %4d ", j + 1); fprintf (HF, "
%4d ", j + 1); } fprintf (OF, " (%.2f) ", percentage); fprintf (HF, " (%.2f) ", percentage); fprintf (OF, "\n"); fprintf (HF, "
\n"); //fprintf (L, "\n"); // fprintf (HF, "
"); } // fprintf(OF, " %f (%d) ", tmpflo, k); //fprintf(OF, " %f ", 100.0*(tmpflo - seekpat->next->min)/(seekpat->next->max - seekpat->next->min)); k = 0; } } } newmin = 0; newmax = 0; for (pp = 0; pp < strlen (seekpat->next->pat); pp++) { maxtmp = 0; mintmp = 1; for (qq = 0; qq < 4; qq++) { if (((double) newmatrix[pp][qq] / howmany) > maxtmp) maxtmp = (double) newmatrix[pp][qq] / howmany; if (((double) newmatrix[pp][qq] / howmany) < mintmp) mintmp = newmatrix[pp][qq] / howmany; } newmin += log (mintmp + 0.001); newmax += log (maxtmp); } fprintf (OF, "\n\t Frequency Matrix \n"); fprintf (OF, "\t All Occurrences \t\t Best Occurrences \n"); fprintf (OF, "\t A C G T\t\t A C G T\n"); fprintf (HF, "
"); fprintf (HF, ""); fprintf (HF, "
Frequency Matrix
All Occs Best Occs"); fprintf (HF, "
ACGT ACGT", color[0], color[1], color[2], color[3], color[0], color[1], color[2], color[3]); for (pp = 0; pp < strlen (seekpat->next->pat); pp++) { sprintf (entry[0], "%d", seekpat->next->matrix[pp][0]); sprintf (entry[1], "%d", seekpat->next->matrix[pp][1]); sprintf (entry[2], "%d", seekpat->next->matrix[pp][2]); sprintf (entry[3], "%d", seekpat->next->matrix[pp][3]); fprintf (OF, "%d \t%5d %5d %5d %5d\t", pp + 1, seekpat->next->matrix[pp][0], seekpat->next->matrix[pp][1], seekpat->next->matrix[pp][2], seekpat->next->matrix[pp][3]); fprintf (OF, "\t%5d %5d %5d %5d\n", newmatrix[pp][0], newmatrix[pp][1], newmatrix[pp][2], newmatrix[pp][3]); fprintf (HF, "
%d \t%d %d %d %d", pp + 1, seekpat->next->matrix[pp][0], seekpat->next->matrix[pp][1], seekpat->next->matrix[pp][2], seekpat->next->matrix[pp][3]); fprintf (HF, "|\t%d %d %d %d", newmatrix[pp][0], newmatrix[pp][1], newmatrix[pp][2], newmatrix[pp][3]); } fprintf (HF, "

"); fprintf (OF, "\n==========================================\n"); fprintf (HF, "


"); } // seekpat = seekpat->next; } } } } if (adviceflag == 1) { fprintf (OF, "\nSorry! No advice on this one.....\n"); fprintf (HF, "
Sorry! No advice on this one.....
"); } fclose (OF); fprintf (HF, ""); fclose (HF); fprintf(stderr, "\nJob completed\n"); exit (0); } int chartoint (char car) { int ret; switch (car) { case 'a': return 0; break; case 'c': return 1; break; case 'g': return 2; break; case 't': return 3; break; case '$': return 4; break; default: return -1; break; } } struct pattern * mergesrt (struct pattern *primo, int quanti) { int i, j, k; int first, second; double mid; struct pattern *l1, *l2; struct pattern *ans; if (quanti == 1) return primo; else { l1 = primo; l2 = primo; mid = (double) (quanti) / 2; if (mid > floor (mid)) { mid = mid - .5; first = (int) mid; second = first + 1; } else { first = (int) mid; second = (int) mid; } for (i = 0; i < first - 1; i++) l2 = l2->next; ans = l2; l2 = l2->next; ans->next = endpat; l1 = mergesrt (l1, first); l2 = mergesrt (l2, second); ans = merge (l1, l2); return ans; } } struct pattern * merge (struct pattern *uno, struct pattern *due) { struct pattern *ret, *first; int flag = 0; if ((uno->score > due->score) || ((uno->score == due->score) && (uno->counter >= due->counter))) { first = uno; uno = uno->next; ret = first; ret->next = endpat; } else { first = due; due = due->next; ret = first; ret->next = endpat; } while (flag == 0) { if ((uno->score > due->score) || ((uno->score == due->score) && (uno->counter >= due->counter))) { if (uno->score == -111111) flag = 1; else { ret->next = uno; uno = uno->next; ret = ret->next; ret->next = endpat; } } else { ret->next = due; due = due->next; ret = ret->next; ret->next = endpat; } } return first; } char revcomp (char c) { char ret; switch (tolower (c)) { case 'a': ret = 'T'; break; case 'c': ret = 'G'; break; case 'g': ret = 'C'; break; case 't': ret = 'A'; break; default: ret = toupper (c); //fprintf(stderr,"\nLe sequenze di input contengono una qualche cazzata : %c\n", c); //exit(2); break; } return ret; } int overlap (char *a, char *b, int checkrev) { int l; int i, j, k; int len; int ans1 = 0; int ans2 = 0; int flag = 1; int toret = 0; int ansrev; char *p; len = strlen (a); flag = 1; for (l = 1; l < 3; l++) { for (i = 0; i < len - l; i++) { if (a[i + l] != b[i]) { flag = 0; ans1 = 0; break; } } if (flag == 1) toret = 1; flag = 1; for (i = 0; i < len - l; i++) { if (a[i] != b[i + l]) { flag = 0; ans2 = 0; break; } } if (flag == 1) toret = 1; } if (checkrev == 1) { p = revpattern (a); ansrev = overlap (p, b, 0); if (ansrev == 1) toret = 1; } return toret; } int hamming (char *a, char *b, int checkrev) { int h = 0, h1 = 9999; int i; int minl; char *p, *q; if (strlen (a) != strlen (b)) return 9999; if (strcmp (a, b) == 0) return 9999; for (i = 0; i < strlen (a); i++) if (a[i] != b[i]) h++; if (checkrev == 0) return h; if (checkrev == 1) { p = revpattern (a); h1 = 0; for (i = 0; i < strlen (a); i++) if (a[i] != b[i]) h1++; } if (h1 < h) h = h1; return h; } int inside (char *a, char *b, int checkrev) { int len1, len2; int i, j, k, match; int ans = 0, ansrev = 0; char *c; len1 = strlen (a); len2 = strlen (b); if (((len2 - len1) != 2)) return 0; for (i = 0; i < len2 - len1 + 1; i++) { match = 1; for (j = i; j < i + len1; j++) { if (a[j - i] != b[j]) { match = 0; break; } } if (match == 1) ans = 1; } if (checkrev == 1) { c = revpattern (a); ansrev = inside (c, b, 0); if (ansrev == 1) ans = 1; } return ans; } char * revpattern (char *a) { int len, i, j; char *toret; len = strlen (a); toret = (char *) malloc ((len + 1) * sizeof (char)); bzero (toret, len + 1); for (i = 0; i < len; i++) toret[i] = toupper (revcomp (a[len - i - 1])); return (toret); }