/***************************************/ /* Weeder 1.4 see LICENSE.txt file for terms and conditions of use */ #define MAXSEQ 100000 #define CPS CLOCKS_PER_SEC #include #include #include #include #include #include #include #include #include #include #include char **sequence; int *lens; int winlen = 0; char alphabet[4] = { 'A', 'C', 'G', 'T' }; char **name; double eightmers[65536]; double sixmers[4096]; double tenmers[1048576]; int mycounter = 0; int totseq = 0; int totlen = 0; int MINREP = 2; double repperc = -1; int MAXERR = -1; int localerror; double percent = 0.0; FILE *F; int *besthere; int *inbestseq; time_t starttime; time_t endtime; int verbose = 0; int reversed = 0; int onlybestflag = 1; char organism[80] = ""; struct pattern { double sig; double score; unsigned short int counter; unsigned short int searchcount; char *pat; struct pattern *next; }; struct pattern *headpat; struct pattern *endpat; struct pattern *seekpat; int chartoint (char car); int min (int a, int b); struct pattern *mergesrt (struct pattern *primo, int quanti); struct pattern *merge (struct pattern *uno, struct pattern *due); char revcomp (char c); int readfreqs (void); double compfreqs (char *pat, char *pos, int err); double exactfreqs (char *pat, int errs); double newtwmers (char *pat, char *pos, int err); double tentotwelve (char *pat); char *reversepat (char *pat); char inttochar (int num); double exactfreqswithcheck (char *pat, int errs, char *check); double compfreqswithcheck (char *pat, char *pos, int err, char *check); double newtwmerswithcheck (char *inpat, char *pos, int err, char *check); int isvalid (char car); int checkmotifanderror (int motlen, int errs); FILE *OF; FILE *MF; FILE *HF; main (int argc, char *argv[]) { int errdef = 0; int uu, pp, qq; int **used; int flagover; double patfreqs[5]; int toreport = 10; int start, end; double tmpfloat; int maah; int lastocc; char revseek[15]; int inbest; double bestmy; double myscore; double *besterr; int jj; int bestsofar; int partial; int bestpos; int i, j, k, l; int krev; int howmany = 0; int lastinseq = -1; int howmanyseq = 0; int n; int nchar; char buffer[MAXSEQ]; char infile[128] = ""; int len; int perint; int extfile = 0; int partiallen = 0; FILE *G; char outfile[800] = ""; char mixfile[800] = ""; char browsefile[800] = ""; if (argc < 4) { fprintf (stderr, "\nUsage : weederTFBS.out -f inputfilename -O speciescode -W motif_width -e error -R repeatspercentage \n"); fprintf (stderr, "\nspeciescode: two-letter species code (e.g. HS, MM, RN, SC, and so on)\n"); fprintf (stderr, "-W : motif length\n"); fprintf (stderr, "-e : number of mutations allowed\n"); fprintf (stderr, "-R : sequences percentage on which motif has to appear\n"); fprintf (stderr, "\noptions (not required):\n"); fprintf (stderr, "-S: process both strands of input sequences (default: single strand)\n"); fprintf (stderr, "-M: multiple motif occurrences in each sequence (default: expect zero or one occurrences per sequence)\n"); fprintf (stderr, "-T : report top motifs\n"); fprintf (stderr, "-V: verbose mode\n"); exit (1); } i = 1; while ((i < argc)) { switch (argv[i][1]) { case 'O': if ((i + 1 < argc) && (argv[i + 1][0] != '-')) { strcpy (organism, argv[i + 1]); i = i + 2; } else { fprintf (stderr, "\nProblem in organism definition\n"); exit (1); } break; case 'W': if ((i + 1 < argc) && (argv[i + 1][0] != '-')) { winlen = atoi (argv[i + 1]); if ((winlen != 6) && (winlen != 8) && (winlen != 10) && (winlen != 12)) { fprintf (stderr, "\nAccepted motif lengths are even values between 6 and 12\n"); exit (1); } i = i + 2; } else { fprintf (stderr, "\nProblem in motif length definition\n"); exit (1); } break; case 'T': if ((i + 1 < argc) && (argv[i + 1][0] != '-')) { toreport = atoi (argv[i + 1]); i = i + 2; } else { fprintf (stderr, "\nProblem in number of reported motifs definition\n"); exit (1); } break; case 'e': if ((i + 1 < argc) && (argv[i + 1][0] != '-')) { MAXERR = atoi (argv[i + 1]); if ((MAXERR < 0)) { fprintf (stderr, "\nError in defining mismatch number\n"); exit (1); } i = i + 2; errdef = 1; } else { fprintf (stderr, "\nError in defining mismatch number\n"); exit (1); } break; case 'R': if ((i + 1 < argc) && (argv[i + 1][0] != '-')) { MINREP = atoi (argv[i + 1]); if ((MINREP < 1) || (MINREP > 100)) { fprintf (stderr, "\nError in defining repeat percentage -R %d\n", MINREP); exit (1); } i = i + 2; repperc = MINREP; } else { fprintf (stderr, "\nError in defining repeat percentage -R\n"); exit (1); } break; case 'f': if (strcmp ("-f", argv[i]) == 0) { if (i + 1 < argc) { strcpy (infile, argv[i + 1]); extfile = 1; i = i + 2; } else { fprintf (stderr, "\nDid not understand filename\n"); exit (1); } } break; case 'V': verbose = 1; i++; break; case 'S': reversed = 1; i++; break; case 'N': i++; break; case 'M': onlybestflag = 0; i++; break; default: fprintf (stderr, "\nSomething wrong with the commandline\n"); fprintf (stderr, "\nUsage : weederTFBS.out -f inputfilename -O speciescode -W motif_width -e error -R repeatspercentage \n"); fprintf (stderr, "\nspeciescode: two-letter species code (e.g. HS, MM, RN, SC, and so on)\n"); fprintf (stderr, "-W : motif length\n"); fprintf (stderr, "-e : number of mutations allowed\n"); fprintf (stderr, "-R : sequences percentage on which motif has to appear\n"); fprintf (stderr, "\noptions (not required):\n"); fprintf (stderr, "-S: process both strands of input sequences (default: single strand)\n"); fprintf (stderr, "-M: multiple motif occurrences in each sequence (default: expect zero or one occurrences per sequence)\n"); fprintf (stderr, "-T : report top motifs\n"); fprintf (stderr, "-V: verbose mode\n"); exit (1); break; } } if (errdef == 0) { fprintf (stderr, "\nMissing error number -e\n"); exit (1); } if (extfile == 0) { fprintf (stderr, "\nMissing input filename\n"); exit (1); } G = fopen (infile, "r"); if (G == NULL) { fprintf (stderr, "\nNo such file : %s\n", infile); exit (1); } if (winlen == 0) { fprintf (stderr, "\nIllegal motif length\n"); exit (1); } if (checkmotifanderror (winlen, MAXERR) == 0) { fprintf (stderr, "\nIllegal combination of length %d and mismatches %d\n", winlen, MAXERR); exit (1); } readfreqs (); /* Conteggio lunghezza sequenze */ partiallen = 0; l = 0; nchar = 0; totseq = 0; lens = (int *) malloc (MAXSEQ * sizeof (int)); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); if (buffer[0] != '>') { fprintf (stderr, "\nInput file not in FASTA format\nExiting program\n"); exit (0); } while (nchar != EOF) { totseq++; nchar = fscanf (G, "\n%[^\n]", buffer); while ((buffer[0] != '>') && (nchar != EOF)) { partiallen += strlen (buffer); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); } lens[l++] = partiallen; partiallen = 0; } /* Fine conteggio lunghezza sequenze */ fclose (G); G = fopen (infile, "r"); if (G == NULL) { fprintf (stderr, "\nNo such file : %s\n", infile); exit (1); } besthere = (int *) malloc (totseq * sizeof (int)); inbestseq = (int *) malloc (totseq * sizeof (int)); sequence = (char **) malloc ((totseq + 1) * sizeof (char *)); used = (int **) malloc ((totseq + 1) * sizeof (int *)); name = (char **) malloc ((totseq + 1) * sizeof (char *)); for (i = 0; i < totseq + 1; i++) { name[i] = (char *) malloc ((20) * (sizeof (char))); sequence[i] = (char *) malloc ((lens[i] + 1 + 20 + winlen) * (sizeof (char))); used[i] = (int *) malloc ((lens[i]) * sizeof (int)); } if(totseq > MAXSEQ) { fprintf (stderr, "\nSorry! At most %d sequences can be input!\n", MAXSEQ); exit(1); } for (i = 0; i < totseq; i++) { for (j = 0; j < lens[i]; j++) used[i][j] = 0; } bzero (buffer, MAXSEQ); l = 0; fscanf (G, "\n%[^\n]", buffer); strncpy (sequence[l], buffer, 20); ERE: while (l < totseq) { if (buffer[0] == '>') { strncpy (name[l], buffer, 19); bzero (sequence[l], lens[l] + 20 + 1); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); while ((buffer[0] != '>') && (nchar != EOF)) { strcat (sequence[l], buffer); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); } } len = strlen (sequence[l]); if (verbose == 1) fprintf (stderr, "Sequence %s is %d letters long\n", name[l], len); lens[l] = strlen (sequence[l]); totlen += lens[l]; l++; } besterr = (double *) malloc ((MAXERR + 1) * sizeof (double)); if (repperc < 0) { fprintf (stderr, "\nUndefined repeat percentage -R\n"); exit (1); } repperc = (repperc * totseq / 100.0); MINREP = ceil (repperc); sprintf (outfile, "%s.wee", infile); sprintf (mixfile, "%s.mix", infile); sprintf (browsefile, "%s.html", infile); OF = fopen (outfile, "a"); MF = fopen (mixfile, "a"); HF = fopen (browsefile, "a"); fprintf (stderr, "\nSearching for motifs of length %d with %d mutations in file %s.....\n", winlen, MAXERR, infile); fprintf (OF, "\nSearching for motifs of length %d with %d mutations.....\n", winlen, MAXERR); fprintf (HF, "
Searching for motifs of length %d with %d mutations.....

", winlen, MAXERR); if (verbose == 1) fprintf (stderr, "\nMinimum sequence repeats : %d out of %d\n", MINREP, totseq); fprintf (OF, "(Actual commandline is : \n"); fprintf (HF, "(Actual commandline is :
"); for (i = 0; i < argc; i++) { fprintf (OF, "%s ", argv[i]); fprintf (HF, "%s ", argv[i]); } fprintf (OF, ")\n"); fprintf (HF, ")
"); for (l = 0; l < totseq; l++) for (i = 0; i < lens[l]; i++) { sequence[l][i] = toupper (sequence[l][i]); if (isvalid (sequence[l][i]) == 0) { if (i - winlen + 1 < 0) start = 0; else start = i - winlen + 1; //if (i + winlen > lens[l]) //end = lens[l]; //else end = i; for (j = start; j <= end; j++) used[l][j] = 1; } } starttime = clock () / CPS; headpat = (struct pattern *) malloc (sizeof (struct pattern)); endpat = (struct pattern *) malloc (sizeof (struct pattern)); endpat->score = -111111.0; headpat->next = endpat; l = 0; while (l < totseq) { if (verbose == 1) fprintf (stderr, "Processing sequence %s\n", name[l]); for (i = 0; i < lens[l] - winlen; i++) { if ((verbose == 1) && ((i % 100) == 0)) fprintf (stderr, "Position %d\n", i); if ((used[l][i] == 0)) { used[l][i] = 1; seekpat = (struct pattern *) malloc (sizeof (struct pattern)); seekpat->pat = (char *) malloc ((winlen + 1) * sizeof (char)); bzero (seekpat->pat, winlen + 1); seekpat->counter = l; seekpat->searchcount = i; seekpat->sig = 1.0; mycounter++; for (j = 0; j < winlen; j++) seekpat->pat[j] = toupper (sequence[l][i + j]); seekpat->next = headpat->next; headpat->next = seekpat; seekpat = headpat; myscore = 0.0; howmany = 0; howmanyseq = 0; if (reversed == 1) { bzero (revseek, 15); j = strlen (seekpat->next->pat) + 1; for (uu = 0; uu <= j - 2; uu++) { revseek[uu] = revcomp (seekpat->next->pat[j - 2 - uu]); } } /* QUI PER SINGOLO STRAND */ if (reversed != 1) { for (uu = 0; uu <= MAXERR; uu++) { besterr[uu] = -1.0; } lastinseq = -1; for (uu = 0; uu < totseq; uu++) { inbestseq[uu] = -1; besthere[uu] = -1; inbest = 0; lastocc = -1; bestsofar = 1000; bestpos = -1; inbest = 0; localerror = MAXERR; maah = (lens[uu]) - strlen (seekpat->next->pat) + 1; if ((maah > 0) && (howmanyseq + totseq - uu >= MINREP)) { for (j = 0; j < maah; j++) { k = 0; for (pp = 0; pp < winlen; pp++) { if (toupper (sequence[uu][j + pp]) != (seekpat->next->pat[pp])) { k++; } if (k > localerror) break; } if ((k <= localerror)) { if (k == 0) used[uu][j] = 1; howmany++; lastocc = j; if ((k == bestsofar) && (lastinseq == uu)) { inbest++; } if (lastinseq != uu) { lastinseq = uu; howmanyseq++; bestpos = j; bestsofar = k; inbest = 1; } if ((k < bestsofar)) { bestpos = j; bestsofar = k; inbest = 1; if (onlybestflag == 1) localerror = bestsofar; } k = 0; } } } if (bestpos > -1) { if (besterr[bestsofar] == -1) besterr[bestsofar] = 1; besthere[uu] = bestsofar; inbestseq[uu] = inbest; } } if ((howmanyseq >= MINREP)) { patfreqs[0] = exactfreqs (seekpat->next->pat, 0); for (uu = 1; uu <= MAXERR; uu++) { if (winlen != 12) patfreqs[uu] = exactfreqs (seekpat->next->pat, uu); else patfreqs[uu] = exactfreqs (seekpat->next->pat, uu) + patfreqs[uu - 1]; } seekpat->next->score = 0.0; myscore = 0.0; for (uu = 0; uu < totseq; uu++) { if (besthere[uu] >= 0) tmpfloat = log ((double) inbestseq[uu]) - (log (patfreqs[besthere[uu]]) + log ((double) lens[uu])); else tmpfloat = 0; myscore += tmpfloat; } seekpat->next->score = (myscore); if (onlybestflag != 1) seekpat->next->score += log ((double) howmany) - (log (patfreqs[MAXERR]) + log ((double) totlen)); } else { seekpat->next->score = -1.0; } // seekpat = seekpat->next; } else { for (uu = 0; uu <= MAXERR; uu++) { besterr[uu] = -1.0; } howmany = 0; howmanyseq = 0; lastinseq = -1; myscore = 0.0; for (uu = 0; uu < totseq; uu++) { inbestseq[uu] = -1; besthere[uu] = -1; bestsofar = 1000; bestpos = -1; maah = (lens[uu]) - winlen + 1; localerror = MAXERR; if ((maah > 0) && (howmanyseq + totseq - uu >= MINREP)) { for (j = 0; j < maah; j++) { k = 0; krev = 0; for (pp = 0; pp < winlen; pp++) { if ((sequence[uu][j + pp]) != (seekpat->next->pat[pp])) { k++; } if ((sequence[uu][j + pp]) != ((revseek[pp]))) { krev++; } if ((k > localerror) && (krev > localerror)) break; } if ((k <= localerror) && (k <= krev)) { howmany++; if (k == 0) used[uu][j] = 1; flagover = 0; if ((j - winlen + 1 > lastocc) || (lastocc == -1)) { flagover = 1; } if ((k == bestsofar) && (lastinseq == uu)) inbest++; if (lastinseq != uu) { inbest = 1; lastinseq = uu; howmanyseq++; bestpos = j; bestsofar = k; } if (k < bestsofar) { inbest = 1; bestpos = j; bestsofar = k; if (onlybestflag == 1) localerror = bestsofar; } } if ((krev <= localerror)&&(krev < k)) { if (krev == 0) used[uu][j] = 1; howmany++; flagover = 0; if ((j - winlen + 1 > lastocc) || (lastocc == -1)) { flagover = 1; } if ((krev == bestsofar) && (lastinseq == uu)) inbest++; if (lastinseq != uu) { inbest = 1; lastinseq = uu; howmanyseq++; bestpos = j; bestsofar = krev; } if ((krev < bestsofar)) { inbest = 1; bestpos = j; bestsofar = krev; if (onlybestflag == 1) localerror = bestsofar; } k = 0; } } } if (bestpos > -1) { besthere[uu] = bestsofar; inbestseq[uu] = inbest; } } if ((howmanyseq >= MINREP)) { patfreqs[0] = exactfreqs (seekpat->next->pat, 0) + exactfreqswithcheck (revseek, 0, seekpat->next-> pat); for (uu = 1; uu <= MAXERR; uu++) { if (winlen != 12) patfreqs[uu] = exactfreqs (seekpat->next->pat, uu) + exactfreqswithcheck (revseek, uu, seekpat-> next-> pat); else patfreqs[uu] = exactfreqs (seekpat->next->pat, uu) + exactfreqswithcheck (revseek, 0, seekpat-> next-> pat) + patfreqs[uu - 1]; } seekpat->next->score = 0.0; myscore = 0.0; for (uu = 0; uu < totseq; uu++) { if (besthere[uu] > -1) tmpfloat = log ((double) inbestseq[uu]) - (log (patfreqs[besthere[uu]]) + log (lens[uu])); else tmpfloat = 0.0; myscore += tmpfloat; } seekpat->next->score = (myscore); if (onlybestflag != 1) seekpat->next->score += log ((double) howmany) - (log (patfreqs[MAXERR]) + log ((double) totlen)); } else { seekpat->next->score = -1; //mycounter++; } /* if (search_complement (seekpat) == 1) { fprintf (stderr, "\nREVERSE COMPLEMENT : %s\n", seekpat->next->next->pat); //fprintf (stderr, "\nREVERSE COMPLEMENT : %s \n", //seekpat->next->next->pat); seekpat->next->next->score = -10000; i++; } */ } } //else //fprintf(stderr, "USED!! %d %d\n", l,i); } l++; } if (headpat->next != endpat) headpat->next = mergesrt (headpat->next, mycounter); seekpat = headpat; i = 0; while ((seekpat->next->score > 0) && (i < toreport)) { i++; fprintf (stderr, "%d) %s %.2f\n", i, seekpat->next->pat, seekpat->next->score/totseq); fprintf (OF, "%d) %s %.2f \n", i, seekpat->next->pat, seekpat->next->score/totseq); fprintf (HF, "%d) %s %.2f
", i, seekpat->next->pat, seekpat->next->score/totseq); fprintf (MF, "%d) %s %.2f %d\n", i, seekpat->next->pat, seekpat->next->score/totseq, MAXERR); seekpat = seekpat->next; } endtime = clock () / CPS; fprintf (OF, "\nElapsed time : %d min. %d sec.\n", (endtime - starttime) / 60, (endtime - starttime) % 60); fprintf (HF, "
Elapsed time : %d min. %d sec.
", (endtime - starttime) / 60, (endtime - starttime) % 60); fprintf (stderr, "Elapsed time : %d min. %d sec.", (endtime - starttime) / 60, (endtime - starttime) % 60); fprintf (stderr, "\n"); fprintf (OF, "\n"); fprintf (HF, "
"); fclose (OF); fclose (MF); fclose (HF); fprintf (stderr, "\nOutput written in files %s and %s\n", outfile, browsefile); exit (0); } int min (int a, int b) { if (a < b) return a; else return b; } int chartoint (char car) { int ret; switch (tolower (car)) { case 'a': return 0; break; case 'c': return 1; break; case 'g': return 2; break; case 't': return 3; break; case '$': return 4; break; default: return -1; break; } } struct pattern * mergesrt (struct pattern *primo, int quanti) { int i, j, k; int first, second; double mid; struct pattern *l1, *l2; struct pattern *ans; if (quanti == 1) return primo; else { l1 = primo; l2 = primo; mid = (double) (quanti) / 2; if (mid > floor (mid)) { mid = mid - .5; first = (int) mid; second = first + 1; } else { first = (int) mid; second = (int) mid; } for (i = 0; i < first - 1; i++) l2 = l2->next; ans = l2; l2 = l2->next; ans->next = endpat; l1 = mergesrt (l1, first); l2 = mergesrt (l2, second); ans = merge (l1, l2); return ans; } } struct pattern * merge (struct pattern *uno, struct pattern *due) { struct pattern *ret, *first; int flag = 0; if ((uno->score > due->score) || ((uno->score == due->score) && (uno->counter >= due->counter))) { first = uno; uno = uno->next; ret = first; ret->next = endpat; } else { first = due; due = due->next; ret = first; ret->next = endpat; } while (flag == 0) { if ((uno->score > due->score) || ((uno->score == due->score) && (uno->counter >= due->counter))) { if (uno->score == -111111) flag = 1; else { ret->next = uno; uno = uno->next; ret = ret->next; ret->next = endpat; } } else { ret->next = due; due = due->next; ret = ret->next; ret->next = endpat; } } return first; } char revcomp (char c) { char ret; switch (tolower (c)) { case 'a': ret = 'T'; break; case 'c': ret = 'G'; break; case 'g': ret = 'C'; break; case 't': ret = 'A'; break; default: ret = c; break; } return ret; } int readfreqs (void) { double checksum = 0; double decsum = 0; int pre; int post; int backpre, backpost; double backfreqpre, backfreqpost; double decfreq; double prefreq, postfreq; int sum; int i, j, k, n; int i1, i2, i3, i4, i5, i6, i7, i8; double totalcountsix; double totalcounteight; FILE *F; int tmpint; int quale = 0; char base[20] = ""; char freqfname[40] = ""; char decfname[20] = ""; char decamer[10]; int eightmer; int deccount; if (verbose == 1) fprintf (stderr, "\nComputing frequencies...\n"); if (strcmp (organism, "") == 0) { fprintf (stderr, "\nUndefined organism (must add -O)\n"); exit (0); } for (i = 0; i < strlen (organism); i++) organism[i] = toupper (organism[i]); sprintf (freqfname, "./FreqFiles/%s.8.freq", organism); F = fopen (freqfname, "r"); if (F == NULL) { fprintf (stderr, "\nCannot find frequency file %s\nWrong organism code, or the FreqFiles directory is not correctly positioned\n", freqfname); exit (0); } i = 0; n = fscanf (F, "%*s %d\n", &tmpint); sum = 0; while (n != EOF) { sum += tmpint; eightmers[i] = (double) tmpint; i++; n = fscanf (F, "%*s %d\n", &tmpint); } fclose (F); totalcounteight = sum; checksum = 0; for (i = 0; i < 65536; i++) { eightmers[i] /= (double) totalcounteight; checksum += eightmers[i]; } //fprintf (stderr, "\nEightmers : %g", checksum); sprintf (freqfname, "./FreqFiles/%s.6.freq", organism); F = fopen (freqfname, "r"); if (F == NULL) { fprintf (stderr, "\nCannot find frequency file %s\n", freqfname); exit (0); } i = 0; sum = 0; n = fscanf (F, "%*s %d\n", &tmpint); while (n != EOF) { sum += tmpint; sixmers[i] = (double) tmpint; i++; n = fscanf (F, "%*s %d\n", &tmpint); } totalcountsix = sum; for (i = 0; i < 4096; i++) sixmers[i] /= (double) totalcountsix; if (winlen >= 10) { for (i = 0; i < 4; i++) { decamer[0] = inttochar (i); eightmer = 0; for (i1 = 0; i1 < 4; i1++) { decamer[1] = inttochar (i1); eightmer += (int) pow (4.0, 7.0) * i1; for (i2 = 0; i2 < 4; i2++) { decamer[2] = inttochar (i2); eightmer += (int) pow (4.0, 6.0) * i2; for (i3 = 0; i3 < 4; i3++) { decamer[3] = inttochar (i3); eightmer += (int) pow (4.0, 5.0) * i3; for (i4 = 0; i4 < 4; i4++) { decamer[4] = inttochar (i4); eightmer += (int) pow (4.0, 4.0) * i4; for (i5 = 0; i5 < 4; i5++) { decamer[5] = inttochar (i5); eightmer += (int) pow (4.0, 3.0) * i5; for (i6 = 0; i6 < 4; i6++) { decamer[6] = inttochar (i6); eightmer += (int) pow (4.0, 2.0) * i6; for (i7 = 0; i7 < 4; i7++) { decamer[7] = inttochar (i7); eightmer += (int) pow (4.0, 1.0) * i7; for (i8 = 0; i8 < 4; i8++) { decamer[8] = inttochar (i8); pre = eightmer / 4 + (int) pow (4.0, 7.0) * i; backpre = 0; backfreqpre = 0.0; for (k = 0; k < 4; k++) { backpre = eightmer / 4 + (int) pow (4.0, 7.0) * k; backfreqpre += eightmers[backpre]; } prefreq = eightmers[pre] / backfreqpre; eightmer += (int) pow (4.0, 0.0) * i8; for (j = 0; j < 4; j++) { decfreq = prefreq * eightmers[eightmer]; decamer[9] = inttochar (j); post = (eightmer - ((int) pow (4.0, 7.0) * i1)) * 4; backpost = (eightmer - ((int) pow (4.0, 7.0) * i1)) * 4; post += j; backfreqpost = 0; for (k = 0; k < 4; k++) { backpost = backpost + k; backfreqpost += eightmers[backpost]; backpost -= k; } decfreq = decfreq * (eightmers[post] / backfreqpost); postfreq = (eightmers[post] / backfreqpost); decsum += decfreq; deccount = eightmer * 4 + (i * (int) pow (4.0, 9.0)) + j; tenmers[deccount] = decfreq; //fprintf (stderr, "%d %s %f %d %f %f %.10f\n", deccount,decamer,prefreq,eightmer,eightmers[eightmer],postfreq,decfreq); //fprintf (DF, "%.10f\n", decfreq); //if (decfreq <= 0) //{ //fprintf (stderr, "\n%g\n", //decfreq); //getchar (); //} } eightmer -= (int) pow (4.0, 0.0) * i8; } eightmer -= (int) pow (4.0, 1.0) * i7; } eightmer -= (int) pow (4.0, 2.0) * i6; } eightmer -= (int) pow (4.0, 3.0) * i5; } eightmer -= (int) pow (4.0, 4.0) * i4; } eightmer -= (int) pow (4.0, 5.0) * i3; } eightmer -= (int) pow (4.0, 6.0) * i2; } eightmer -= (int) pow (4.0, 7.0) * i1; } } //fprintf (stderr, "%.8f\n", decsum); } return 1; } double compsix (char *pat, char *pos, int err) { int i, j, k; int jj, pp; int nn; double toret = 0; int *appro; int totapp; nn = strlen (pat) - 1; totapp = (int) pow (4.0, err); appro = (int *) malloc ((totapp+1) * sizeof (int)); for (i = 0; i <= totapp; i++) appro[i] = 0; for (jj = 0; jj <= nn; jj++) { if (pos[nn - jj] != '*') { for (i = 0; i < totapp; i++) appro[i] += chartoint ((pat[nn - jj])) * (int) pow (4.0, jj); } else { for (i = 0; i < totapp; i++) appro[i] += (i % 4) * (int) pow (4.0, jj); } } for (i = 0; i < totapp; i++) toret += sixmers[appro[i]]; return toret; } double compfreqs (char *pat, char *pos, int err) { int i, j, k; int jj, pp; int nn; double toret = 0; int *appro; int totapp; int contast = 0; nn = strlen (pat) - 1; totapp = (int) pow (4.0, err); appro = (int *) malloc ((totapp+1) * sizeof (int)); for (i = 0; i <= totapp; i++) appro[i] = 0; for (jj = 0; jj <= nn; jj++) { if (pos[nn - jj] != '*') { for (i = 0; i < totapp; i++) appro[i] += chartoint ((pat[nn - jj])) * (int) pow (4.0, jj); } else { for (i = 0; i < totapp; i++) appro[i] += ((i/((int)pow(4.0,contast))) % 4) * (int) pow (4.0, jj); contast++; } } if (nn == 7) { for (i = 0; i < totapp; i++) { toret += eightmers[appro[i]]; //if(err > 1) //fprintf(stderr, "%d\n",appro[i]); } } if (nn == 9) { for (i = 0; i < totapp; i++) { toret += tenmers[appro[i]]; //if(err > 1) //fprintf(stderr, "%d\n",appro[i]); } } //if(err > 1) //getchar(); free (appro); return toret; } double exactfreqs (char *pat, int errs) { int i, j, k, jj, kk; int patno = 0; double toret = 0.0; char ast[9]; char longast[13]; char empty[9] = " "; char longempty[13] = " "; bzero (ast, 9); bzero (longast, 13); if (strlen (pat) == 6) { for (jj = 0; jj < 6; jj++) patno += chartoint (tolower (pat[5 - jj])) * (int) pow (4.0, jj); if (errs == 0) { return sixmers[patno]; } if (errs == 1) { for (j = 0; j < 6; j++) { strcpy (ast, empty); ast[j] = '*'; toret += compsix (pat, ast, 1); } return (double) toret; } } if (strlen (pat) == 8) { for (jj = 0; jj < 8; jj++) patno += chartoint (tolower (pat[7 - jj])) * (int) pow (4.0, jj); if (errs == 0) { return eightmers[patno]; } if (errs == 1) { for (j = 0; j < 8; j++) { strcpy (ast, empty); ast[j] = '*'; toret += compfreqs (pat, ast, 1); } return (double) toret; } if (errs == 2) { for (j = 0; j < 7; j++) { strcpy (ast, empty); ast[j] = '*'; for (k = j + 1; k < 8; k++) { ast[k] = '*'; if (k > j + 1) ast[k - 1] = ' '; toret += compfreqs (pat, ast, 2); } } return (double) toret; } } if (strlen (pat) == 10) { if (errs == 0) { patno = 0; for (jj = 0; jj < 10; jj++) patno += chartoint (tolower (pat[9 - jj])) * (int) pow (4.0, jj); //toret += newtwmers (pat, longempty,0); toret = tenmers[patno]; return (double) toret; } if (errs == 1) { for (j = 0; j < 10; j++) { strcpy (longast, longempty); longast[j] = '*'; toret += compfreqs (pat, longast, 1); } return (double) toret; } if (errs == 2) { for (j = 0; j < 9; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 10; k++) { longast[k] = '*'; if (k > j + 1) longast[k - 1] = ' '; toret += compfreqs (pat, longast, 2); } } return (double) toret; } if (errs == 3) { for (j = 0; j < 8; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 9; k++) { longast[k] = '*'; //if (k > j + 1) //longast[k - 1] = ' '; for (i = k + 1; i < 10; i++) { longast[i] = '*'; //if (i > k + 1) //longast[i - 1] = ' '; toret += compfreqs (pat, longast, 3); //if(i == 9) longast[i] = ' '; } longast[k] = ' '; } } return (double) toret; } } if (strlen (pat) == 12) { if (errs == 0) { strcpy (longast, longempty); toret += newtwmers (pat, longast, 0); return (double) toret; } if (errs == 1) { for (j = 0; j < 12; j++) { strcpy (longast, longempty); longast[j] = '*'; toret += newtwmers (pat, longast, 1); } return (double) toret; } if (errs == 2) { for (j = 0; j < 11; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 12; k++) { longast[k] = '*'; if (k > j + 1) longast[k - 1] = ' '; toret += newtwmers (pat, longast, 2); longast[k] = ' '; } } return (double) toret; } if (errs == 3) { for (j = 0; j < 10; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 11; k++) { longast[k] = '*'; //if (k > j + 1) //longast[k - 1] = ' '; for (i = k + 1; i < 12; i++) { longast[i] = '*'; //if (i > k + 1) //longast[i - 1] = ' '; toret += newtwmers (pat, longast, 3); longast[i] = ' '; } longast[k] = ' '; } } return (double) toret; } if (errs == 4) { for (j = 0; j < 9; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 10; k++) { longast[k] = '*'; //if (k > j + 1) //longast[k - 1] = ' '; for (i = k + 1; i < 11; i++) { longast[i] = '*'; //if (i > k + 1) //longast[i - 1] = ' '; for (kk = i + 1; kk < 12; kk++) { longast[kk] = '*'; //if (kk > i + 1) //longast[kk - 1] = ' '; toret += newtwmers (pat, longast, 4); longast[kk] = ' '; } longast[i] = ' '; } longast[k] = ' '; } } return (double) toret; } } return 5859.0; } int pattolong (char *pat) { int i, toret; int len; len = strlen (pat) - 1; toret = 0; for (i = 0; i <= len; i++) toret += chartoint ((pat[len - i])) * (int) pow (4.0, i); return toret; } double tentotwelve (char *pat) { int i, j, len; char mid[11], post[11], pre[11], back[11]; double startfreq = 0; double frontfreq = 0; double endfreq = 0; double norm = 0; int prelong, midlong, postlong; int primo, ultimo; bzero (post, 11); bzero (pre, 11); bzero (back, 11); bzero (mid, 11); len = strlen (pat); for (i = 0; i < 10; i++) { mid[i] = pat[i + 1]; post[i] = pat[i + 2]; pre[i] = pat[i]; } strcpy (back, pre); prelong = pattolong (pre); frontfreq = tenmers[prelong]; norm = frontfreq; primo = chartoint (pre[0]); //ATTENZIONE for (i = 0; i < 4; i++) { back[0] = alphabet[i]; prelong -= (primo * pow (4.0, 9.0)); if (alphabet[i] != pre[0]) { primo = i; prelong += (primo * pow (4.0, 9.0)); norm += tenmers[prelong]; } } frontfreq /= norm; strcpy (back, post); postlong = pattolong (post); endfreq = tenmers[postlong]; primo = chartoint (post[9]); norm = endfreq; for (i = 0; i < 4; i++) { back[9] = alphabet[i]; postlong -= primo; if (alphabet[i] != post[9]) { primo = i; postlong += primo; norm += tenmers[postlong]; } } endfreq /= norm; startfreq = tenmers[pattolong (mid)]; return startfreq * frontfreq * endfreq; } double newtwmers (char *inpat, char *pos, int err) { int i, j, jj, k, len; int asts[4]; char pre[9], post[9]; char pat[13]; double toret = 0; double freqs[4]; int patmis; int paterr; for (i = 0; i < 4; i++) freqs[i] = 0.0; len = strlen (inpat); strcpy (pat, inpat); j = 0; for (i = 0; i < len; i++) { if (pos[i] == '*') asts[j++] = i; } bzero (post, 9); bzero (pre, 9); if (err == 0) { return tentotwelve (pat); } if (err == 1) { for (i = 0; i < 4; i++) { paterr = 0; pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) toret += tentotwelve (pat); } return toret; } if (err == 2) { for (i = 0; i < 4; i++) { pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) { for (j = 0; j < 4; j++) { pat[asts[1]] = alphabet[j]; if (pat[asts[1]] != inpat[asts[1]]) toret += tentotwelve (pat); } } } return toret; } if (err == 3) { toret += tentotwelve (inpat); for (i = 0; i < 4; i++) { pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) { for (j = 0; j < 4; j++) { pat[asts[1]] = alphabet[j]; if (pat[asts[1]] != inpat[asts[1]]) { for (k = 0; k < 4; k++) { pat[asts[2]] = alphabet[k]; if (pat[asts[1]] != inpat[asts[1]]) toret += tentotwelve (pat); } } } } } return toret; } if (err == 4) { toret += tentotwelve (inpat); for (i = 0; i < 4; i++) { pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) { for (j = 0; j < 4; j++) { pat[asts[1]] = alphabet[j]; if (pat[asts[1]] != inpat[asts[1]]) { for (k = 0; k < 4; k++) { pat[asts[2]] = alphabet[k]; if (pat[asts[2]] != inpat[asts[2]]) { for (jj = 0; jj < 4; jj++) { pat[asts[3]] = alphabet[jj]; if (pat[asts[3]] != inpat[asts[3]]) toret += tentotwelve (pat); } } } } } } } return toret; } } char * reversepat (char *pat) { int len; int i, j, k; char *rev; len = strlen (pat) + 1; rev = (char *) malloc (len + 1); bzero (rev, len + 1); for (i = 0; i <= len - 2; i++) { rev[i] = revcomp (pat[len - 2 - i]); } return rev; } char inttochar (int num) { if (num == 0) return 'A'; if (num == 1) return 'C'; if (num == 2) return 'G'; if (num == 3) return 'T'; } double exactfreqswithcheck (char *pat, int errs, char *check) { int i, j, k, jj, kk; char *partial; int patno = 0; double toret = 0.0; char ast[9]; char longast[13]; char empty[9] = " "; char longempty[13] = " "; bzero (ast, 9); bzero (longast, 13); partial = (char *) malloc (13 * sizeof (char)); bzero (partial, 13); if (strlen (pat) == 6) { for (jj = 0; jj < 6; jj++) patno += chartoint (tolower (pat[5 - jj])) * (int) pow (4.0, jj); if (errs == 0) { if (strcmp (pat, check) != 0) return sixmers[patno]; else return 0.0; } if (errs == 1) { for (j = 0; j < 6; j++) { strcpy (ast, empty); ast[j] = '*'; toret += compsix (pat, ast, 1); } return (double) toret; } } if (strlen (pat) == 8) { for (jj = 0; jj < 8; jj++) patno += chartoint (tolower (pat[7 - jj])) * (int) pow (4.0, jj); if (errs == 0) { if (strcmp (pat, check) != 0) return eightmers[patno]; else return 0.0; } if (errs == 1) { for (j = 0; j < 8; j++) { strcpy (ast, empty); ast[j] = '*'; toret += compfreqswithcheck (pat, ast, 1, check); } return (double) toret; } if (errs == 2) { for (j = 0; j < 7; j++) { strcpy (ast, empty); ast[j] = '*'; for (k = j + 1; k < 8; k++) { ast[k] = '*'; toret += compfreqswithcheck (pat, ast, 2, check); ast[k] = ' '; } } return (double) toret; } } if (strlen (pat) == 10) { if (errs == 0) { patno = 0; for (jj = 0; jj < 10; jj++) patno += chartoint (tolower (pat[9 - jj])) * (int) pow (4.0, jj); //toret += newtwmers (pat, longempty,0); toret = tenmers[patno]; return (double) toret; } if (errs == 1) { for (j = 0; j < 10; j++) { strcpy (longast, longempty); longast[j] = '*'; toret += compfreqswithcheck (pat, longast, 1, check); } return (double) toret; } if (errs == 2) { for (j = 0; j < 9; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 10; k++) { longast[k] = '*'; toret += compfreqswithcheck (pat, longast, 2, check); longast[k] = ' '; } } return (double) toret; } if (errs == 3) { for (j = 0; j < 8; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 9; k++) { longast[k] = '*'; for (i = k + 1; i < 10; i++) { longast[i] = '*'; toret += compfreqswithcheck (pat, longast, 3, check); longast[i] = ' '; } longast[k] = ' '; } } return (double) toret; } } if (strlen (pat) == 12) { if (errs == 0) { strcpy (longast, longempty); if (hamming (pat, check, 0) == 1) toret += newtwmers (pat, longast, 0); else toret = 0; return (double) toret; } if (errs == 1) { for (j = 0; j < 12; j++) { strcpy (longast, longempty); longast[j] = '*'; toret += newtwmerswithcheck (pat, longast, 1, check); } return (double) toret; } if (errs == 2) { for (j = 0; j < 11; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 12; k++) { longast[k] = '*'; toret += newtwmerswithcheck (pat, longast, 2, check); longast[k] = ' '; } } return (double) toret; } if (errs == 3) { for (j = 0; j < 10; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 11; k++) { longast[k] = '*'; for (i = k + 1; i < 12; i++) { longast[i] = '*'; toret += newtwmerswithcheck (pat, longast, 3, check); longast[i] = ' '; } longast[k] = ' '; } } return (double) toret; } if (errs == 4) { for (j = 0; j < 9; j++) { strcpy (longast, longempty); longast[j] = '*'; for (k = j + 1; k < 10; k++) { longast[k] = '*'; for (i = k + 1; i < 11; i++) { longast[i] = '*'; for (kk = i + 1; kk < 12; kk++) { longast[kk] = '*'; toret += newtwmerswithcheck (pat, longast, 4, check); longast[kk] = ' '; } longast[i] = ' '; } longast[k] = ' '; } } return (double) toret; } } return 5859.0; } double compfreqswithcheck (char *pat, char *pos, int err, char *check) { int i, j, k; int jj, pp; int nn; double toret = 0; int *appro; int totapp; char **partial; int contast = 0; nn = strlen (pat) - 1; totapp = (int) pow (4.0, err); partial = (char **) malloc ((totapp + 1) * sizeof (char *)); for (i = 0; i <= totapp; i++) partial[i] = (char *) malloc ((nn + 2) * sizeof (char)); for (i = 0; i <= totapp; i++) bzero (partial[i], nn + 2); appro = (int *) malloc ((totapp+1) * sizeof (int)); for (i = 0; i <= totapp; i++) appro[i] = 0; for (i = 0; i <= totapp; i++) strcpy (partial[i], pat); for (jj = 0; jj <= nn; jj++) { if (pos[nn - jj] != '*') { for (i = 0; i < totapp; i++) appro[i] += chartoint ((pat[nn - jj])) * (int) pow (4.0, jj); } else { for (i = 0; i < totapp; i++) { appro[i] += ((i/((int)pow(4.0,contast))) % 4) * (int) pow (4.0, jj); partial[i][jj] = inttochar (((i/((int)pow(4.0,contast))) % 4)); } contast++; } } if (nn == 7) { for (i = 0; i < totapp; i++) { if ((hamming (partial[i], check, err) != 0)) toret += eightmers[appro[i]]; //fprintf(stderr, "%s\n",partial[i]); } //getchar(); } if (nn == 9) { for (i = 0; i < totapp; i++) { if ((hamming (partial[i], check, err) != 0)) toret += tenmers[appro[i]]; //fprintf(stderr, "%s\n",partial[i]); } //getchar(); } free (appro); for (i = 0; i <= totapp; i++) free (partial[i]); free(partial); return toret; } int hamming (char *pat1, char *pat2, int thre) { int i, len; int err = 0; len = strlen (pat1); for (i = 0; i < len; i++) { if (pat1[i] != pat2[i]) err++; if (err > thre) break; } if (err > thre) return 1; else return 0; } double newtwmerswithcheck (char *inpat, char *pos, int err, char *check) { int i, j, jj, k, len; int asts[4]; char pre[9], post[9]; char pat[13]; char **partial; double toret = 0; len = strlen (inpat); strcpy (pat, inpat); j = 0; for (i = 0; i < len; i++) { if (pos[i] == '*') asts[j++] = i; } bzero (post, 9); bzero (pre, 9); if (err == 0) { if ((hamming (pat, check, err) == 1)) return tentotwelve (pat); else return 0; } if (err == 1) { for (i = 0; i < 4; i++) { pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) { if ((hamming (pat, check, err) == 1)) toret += tentotwelve (pat); } } return toret; } if (err == 2) { toret += tentotwelve (inpat); for (i = 0; i < 4; i++) { pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) { for (j = 0; j < 4; j++) { pat[asts[1]] = alphabet[j]; if (pat[asts[1]] != inpat[asts[1]]) { if ((hamming (pat, check, err) == 1) && (strcmp (pat, inpat) != 0)) toret += tentotwelve (pat); } } } } return toret; } if (err == 3) { toret += tentotwelve (inpat); for (i = 0; i < 4; i++) { pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) { for (j = 0; j < 4; j++) { pat[asts[1]] = alphabet[j]; if (pat[asts[1]] != inpat[asts[1]]) { for (k = 0; k < 4; k++) { pat[asts[2]] = alphabet[k]; if (pat[asts[2]] != inpat[asts[2]]) { if ((hamming (pat, check, err) == 1) && (strcmp (pat, inpat) != 0)) toret += tentotwelve (pat); } } } } } } return toret; } if (err == 4) { toret += tentotwelve (inpat); for (i = 0; i < 4; i++) { pat[asts[0]] = alphabet[i]; if (pat[asts[0]] != inpat[asts[0]]) { for (j = 0; j < 4; j++) { pat[asts[1]] = alphabet[j]; if (pat[asts[1]] != inpat[asts[1]]) { for (k = 0; k < 4; k++) { pat[asts[2]] = alphabet[k]; if (pat[asts[2]] != inpat[asts[2]]) { for (jj = 0; jj < 4; jj++) { pat[asts[3]] = alphabet[jj]; if (pat[asts[3]] != inpat[asts[3]]) { if ((hamming (pat, check, err) == 1) && (strcmp (pat, inpat) != 0)) toret += tentotwelve (pat); } } } } } } } } return toret; } } int isvalid (char car) { int i, j, ret = 1; if ((car != 'A') && (car != 'C') && (car != 'G') && (car != 'T')) ret = 0; return ret; } int checkmotifanderror (int motlen, int errs) { switch (motlen) { case 6: if (errs > 1) return 0; else return 1; break; case 8: if (errs > 3) return 0; else return 1; break; case 10: if (errs > 4) return 0; else return 1; break; case 12: if (errs > 4) return 0; else return 1; break; default: return 0; break; } }