在 Java 中查找 DNA 字符串的超序列

我在实现“查找超序列”算法时遇到了困难。

输入是一组字符串:

// Input: the set of strings for which a common supersequence is sought.
String A = "caagccacctacatca"; String B = "cgagccatccgtaaagttg"; String C = "agaacctgctaaatgctaga"; 

结果将是正确对齐的字符串集(下一步应该是合并)

 // Desired intermediate result: the same three strings with spaces inserted so that they are
 // aligned; removing the spaces from E yields A again. The next step would be to merge them.
 // NOTE(review): runs of padding spaces may have been collapsed in this listing -- confirm
 // against the original post before relying on the exact column alignment.
 String E = "ca ag cca cc ta cat ca"; String F = "c gag ccat ccgtaaa g tt g"; String G = " aga acc tgc taaatgc ta ga"; 

谢谢你的任何建议(我在这项任务上坐了一天以上)

合并后,超级字符串将是

 cagagaccatgccgtaaatgcattacga 

“这种情况”中超级序列的定义就像是

当且仅当字符串R中的所有字符都按照它们在R中出现的先后顺序出现在S中(不要求连续)时,称字符串R包含在超序列S中。


我试过的“解决方案”(再次以错误的方式)是:

 /*
  * The asker's attempted solution, which the author describes as wrong.
  *
  * main pushes the three input strings onto a Stack, pops each one from a clone of that
  * stack, builds a space-padded "residue" for it with make_residue, splits the residue on
  * single spaces, collects the pieces in a HashSet and finally prints every chunk.
  * make_residue places each character of the candidate at the index returned by
  * find_position_for (or at candidate.length()-1 when that call returns -1) and fills the
  * slots between the character's own index and that position with spaces.
  *
  * NOTE(review): this listing is damaged and does not compile as shown:
  *  - generic type arguments appear to be missing (e.g. "String a = clone1.pop()" and
  *    "for(String chunk : chunks)" operate on a raw Stack / HashSet) -- presumably
  *    Stack<String> and HashSet<String>; confirm against the original post;
  *  - the text between "for(int i=offset; i" and "max) max=s.length();" is missing, so the
  *    remainder of find_position_for and the header of max_size cannot be recovered here;
  *  - with the whole class on one line, the commented-out chunks.add(spaces...) call turns
  *    everything after it into a line comment.
  */
 public class Solution4 { static boolean[][] map = null; static int size = 0; public static void main(String[] args) { String A = "caagccacctacatca"; String B = "cgagccatccgtaaagttg"; String C = "agaacctgctaaatgctaga"; Stack data = new Stack(); data.push(A); data.push(B); data.push(C); Stack clone1 = data.clone(); Stack clone2 = data.clone(); int length = 26; size = max_size(data); System.out.println(size+" "+length); map = new boolean[26][size]; char[] result = new char[size]; HashSet chunks = new HashSet(); while(!clone1.isEmpty()) { String a = clone1.pop(); char[] residue = make_residue(a); System.out.println("---"); System.out.println("OLD : "+a); System.out.println("RESIDUE : "+String.valueOf(residue)); String[] r = String.valueOf(residue).split(" "); for(int i=0; i<r.length; i++) { if(r[i].equals(" ")) continue; //chunks.add(spaces.substring(0,i)+r[i]); chunks.add(r[i]); } } for(String chunk : chunks) { System.out.println("CHUNK : "+chunk); } } static char[] make_residue(String candidate) { char[] result = new char[size]; for(int i=0; i<candidate.length(); i++) { int pos = find_position_for(candidate.charAt(i),i); for(int j=i; j<pos; j++) result[j]=' '; if(pos==-1) result[candidate.length()-1] = candidate.charAt(i); else result[pos] = candidate.charAt(i); } return result; } static int find_position_for(char character, int offset) { character-=((int)'a'); for(int i=offset; imax) max=s.length(); } return max; } } 

找到任意一个公共超序列并不困难:

在您的示例中,可能的解决方案将是:

/**
 * Demonstrates a simple greedy construction of a common supersequence of three strings.
 *
 * <p>The produced string contains each input as a subsequence. The construction is greedy
 * and makes no attempt to find the shortest such string.
 */
public class SuperSequenceTest {

    /**
     * Merges the three sample DNA strings and prints the result, prefixed with
     * {@code "SUPERSEQUENCE "}, to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String A = "caagccacctacatca";
        String B = "cgagccatccgtaaagttg";
        String C = "agaacctgctaaatgctaga";

        System.out.println("SUPERSEQUENCE " + superSequence(A, B, C));
    }

    /**
     * Builds a common supersequence of the three inputs by walking them in parallel.
     *
     * <p>At every step one character is emitted and the cursor of each string that was
     * chosen to share it is advanced. Preference order: a character shared by all three
     * strings, then by the first and second, then by the first and third, then by the second
     * and third; otherwise the next character of the third, second or first string (in that
     * order) is emitted on its own.
     *
     * @param first  first input string
     * @param second second input string
     * @param third  third input string
     * @return a string that contains each input as a subsequence
     */
    private static String superSequence(String first, String second, String third) {
        char[] a = first.toCharArray();
        char[] b = second.toCharArray();
        char[] c = third.toCharArray();
        int iA = 0;
        int iB = 0;
        int iC = 0;
        StringBuilder sb = new StringBuilder(a.length + b.length + c.length);

        while (iA < a.length || iB < b.length || iC < c.length) {
            boolean hasA = iA < a.length;
            boolean hasB = iB < b.length;
            boolean hasC = iC < c.length;

            if (hasA && hasB && hasC && a[iA] == b[iB] && a[iA] == c[iC]) {
                // All three strings agree on the next character.
                sb.append(a[iA]);
                iA++;
                iB++;
                iC++;
            } else if (hasA && hasB && a[iA] == b[iB]) {
                sb.append(a[iA]);
                iA++;
                iB++;
            } else if (hasA && hasC && a[iA] == c[iC]) {
                sb.append(a[iA]);
                iA++;
                iC++;
            } else if (hasB && hasC && b[iB] == c[iC]) {
                sb.append(b[iB]);
                iB++;
                iC++;
            } else if (hasC) {
                // No two strings share their next character: emit a single one,
                // preferring the third string, then the second, then the first.
                sb.append(c[iC]);
                iC++;
            } else if (hasB) {
                sb.append(b[iB]);
                iB++;
            } else {
                // The loop condition guarantees the first string still has characters here.
                sb.append(a[iA]);
                iA++;
            }
        }
        return sb.toString();
    }
}

然而,真正要解决的问题是寻找最短公共超序列(shortest common supersequence),这是一个已知的难题,并不容易解决:http://en.wikipedia.org/wiki/Shortest_common_supersequence

关于这个主题有很多研究。

例如,见:

http://www.csd.uwo.ca/~lila/pdfs/Towards%20a%20DNA%20solution%20to%20the%20Shortest%20Common%20Superstring%20Problem.pdf

http://www.ncbi.nlm.nih.gov/pubmed/14534185

您可以尝试找到这样的最短组合

 /**
  * Exhaustive depth-first search for all shortest common supersequences of a set of strings
  * over the alphabet {@link #CHARS}.
  *
  * <p>Requires {@code java.util.ArrayList} and {@code java.util.List}.
  */
 public class ShortestSupersequence {

     /** Alphabet tried at every position of a candidate supersequence, in this order. */
     static final char[] CHARS = "acgt".toCharArray();

     /**
      * Searches the three sample DNA strings and prints the length of the expected answer
      * followed by the length and the list of all shortest supersequences found.
      *
      * @param ignored unused
      */
     public static void main(String[] ignored) {
         String A = "caagccacctacatca";
         String B = "cgagccatccgtaaagttg";
         String C = "agaacctgctaaatgctaga";

         String expected = "cagagaccatgccgtaaatgcattacga";
         List<String> merged = new Combination(A, B, C).findShortest();
         System.out.println("expected: " + expected.length());
         System.out.println("Merged: " + merged.get(0).length() + " " + merged);
     }

     /**
      * Search state for one set of input strings.
      *
      * <p>Only characters contained in {@link #CHARS} are ever tried, so an input containing
      * any other character can never be completed and the result list stays empty.
      */
     static class Combination {
         /** Length of the shortest supersequence found so far. */
         int shortest = Integer.MAX_VALUE;
         /** All supersequences of length {@link #shortest} found so far. */
         List<String> shortestStr = new ArrayList<>();
         /** The input strings as character arrays. */
         char[][] chars;
         /** Start cursors, one per input string (all zero). */
         int[] pos;
         /** Number of complete candidates reached during the current search. */
         int count = 0;

         Combination(String... strs) {
             chars = new char[strs.length][];
             pos = new int[strs.length];
             for (int i = 0; i < strs.length; i++) {
                 chars[i] = strs[i].toCharArray();
             }
         }

         /**
          * Runs the search from scratch.
          *
          * @return every shortest common supersequence, in the order the search finds them
          */
         public List<String> findShortest() {
             // Start from a clean state so that a repeated call does not append duplicates
             // to, or prune against, the results of a previous run.
             shortest = Integer.MAX_VALUE;
             shortestStr = new ArrayList<>();
             count = 0;
             findShortest0(new StringBuilder(), pos);
             return shortestStr;
         }

         /**
          * Extends the candidate in {@code sb} by one character in every possible way.
          *
          * @param sb  candidate built so far; restored to its incoming length on return
          * @param pos cursor into each input string for the candidate in {@code sb}
          */
         private void findShortest0(StringBuilder sb, int[] pos) {
             if (allDone(pos)) {
                 if (sb.length() < shortest) {
                     shortestStr.clear();
                     shortest = sb.length();
                 }
                 if (sb.length() <= shortest) {
                     shortestStr.add(sb.toString());
                 }
                 // Count each complete candidate exactly once. Progress goes to stderr so
                 // that standard output carries only the result lines printed by main.
                 if (++count % 100 == 1) {
                     System.err.println("Searched " + count + " shortest " + shortest);
                 }
                 return;
             }
             // At least maxLeft(pos) more characters are needed; give up if that already
             // exceeds the best known length (ties are kept so all optima are collected).
             if (sb.length() + maxLeft(pos) > shortest) {
                 return;
             }
             int[] pos2 = new int[pos.length];
             int i = sb.length();
             sb.append(' '); // placeholder, overwritten for each character tried below
             for (char c : CHARS) {
                 if (!tryChar(pos, pos2, c)) {
                     continue;
                 }
                 sb.setCharAt(i, c);
                 findShortest0(sb, pos2);
             }
             sb.setLength(i);
         }

         /** Returns the largest number of unconsumed characters over all input strings. */
         private int maxLeft(int[] pos) {
             int maxLeft = 0;
             for (int i = 0; i < pos.length; i++) {
                 int left = chars[i].length - pos[i];
                 if (left > maxLeft) {
                     maxLeft = left;
                 }
             }
             return maxLeft;
         }

         /** Returns {@code true} when every input string has been consumed completely. */
         private boolean allDone(int[] pos) {
             for (int i = 0; i < chars.length; i++) {
                 if (pos[i] < chars[i].length) {
                     return false;
                 }
             }
             return true;
         }

         /**
          * Copies {@code pos} into {@code pos2}, advancing every string whose next character
          * is {@code c}.
          *
          * @return {@code true} if at least one string was advanced
          */
         private boolean tryChar(int[] pos, int[] pos2, char c) {
             boolean matched = false;
             for (int i = 0; i < chars.length; i++) {
                 pos2[i] = pos[i];
                 if (pos[i] >= chars[i].length) {
                     continue;
                 }
                 if (chars[i][pos[i]] == c) {
                     pos2[i]++;
                     matched = true;
                 }
             }
             return matched;
         }
     }
 }

该程序会打印出许多比问题中给出的结果更短的解:

expected: 28
Merged: 27 [acgaagccatccgctaaatgctatcga, acgaagccatccgctaaatgctatgca, acgaagccatccgctaacagtgctaga, acgaagccatccgctaacatgctatga, acgaagccatccgctaacatgcttaga, acgaagccatccgctaacatgtctaga, acgaagccatccgctacaagtgctaga, acgaagccatccgctacaatgctatga, acgaagccatccgctacaatgcttaga, acgaagccatccgctacaatgtctaga, acgaagccatcgcgtaaatgctatcga, acgaagccatcgcgtaaatgctatgca, acgaagccatcgcgtaacagtgctaga, acgaagccatcgcgtaacatgctatga, acgaagccatcgcgtaacatgcttaga, acgaagccatcgcgtaacatgtctaga, acgaagccatcgcgtacaagtgctaga, acgaagccatcgcgtacaatgctatga, acgaagccatcgcgtacaatgcttaga, acgaagccatcgcgtacaatgtctaga, acgaagccatgccgtaaatgctatcga, acgaagccatgccgtaaatgctatgca, acgaagccatgccgtaacagtgctaga, acgaagccatgccgtaacatgctatga, acgaagccatgccgtaacatgcttaga, acgaagccatgccgtaacatgtctaga, acgaagccatgccgtacaagtgctaga, acgaagccatgccgtacaatgctatga, acgaagccatgccgtacaatgcttaga, acgaagccatgccgtacaatgtctaga, cagaagccatccgctaaatgctatcga, cagaagccatccgctaaatgctatgca, cagaagccatccgctaacagtgctaga, cagaagccatccgctaacatgctatga, cagaagccatccgctaacatgcttaga, cagaagccatccgctaacatgtctaga, cagaagccatccgctacaagtgctaga, cagaagccatccgctacaatgctatga, cagaagccatccgctacaatgcttaga, cagaagccatccgctacaatgtctaga, cagaagccatcgcgtaaatgctatcga, cagaagccatcgcgtaaatgctatgca, cagaagccatcgcgtaacagtgctaga, cagaagccatcgcgtaacatgctatga, cagaagccatcgcgtaacatgcttaga, cagaagccatcgcgtaacatgtctaga, cagaagccatcgcgtacaagtgctaga, cagaagccatcgcgtacaatgctatga, cagaagccatcgcgtacaatgcttaga, cagaagccatcgcgtacaatgtctaga, cagaagccatgccgtaaatgctatcga, cagaagccatgccgtaaatgctatgca, cagaagccatgccgtaacagtgctaga, cagaagccatgccgtaacatgctatga, cagaagccatgccgtaacatgcttaga, cagaagccatgccgtaacatgtctaga, cagaagccatgccgtacaagtgctaga, cagaagccatgccgtacaatgctatga, cagaagccatgccgtacaatgcttaga, cagaagccatgccgtacaatgtctaga, cagagaccatccgctaaatgctatcga, cagagaccatccgctaaatgctatgca, cagagaccatccgctaacagtgctaga, cagagaccatccgctaacatgctatga, cagagaccatccgctaacatgcttaga, cagagaccatccgctaacatgtctaga, cagagaccatccgctacaagtgctaga, cagagaccatccgctacaatgctatga, cagagaccatccgctacaatgcttaga, cagagaccatccgctacaatgtctaga, cagagaccatcgcgtaaatgctatcga, cagagaccatcgcgtaaatgctatgca, cagagaccatcgcgtaacagtgctaga, cagagaccatcgcgtaacatgctatga, cagagaccatcgcgtaacatgcttaga, cagagaccatcgcgtaacatgtctaga, cagagaccatcgcgtacaagtgctaga, cagagaccatcgcgtacaatgctatga, cagagaccatcgcgtacaatgcttaga, cagagaccatcgcgtacaatgtctaga, cagagaccatgccgtaaatgctatcga, cagagaccatgccgtaaatgctatgca, cagagaccatgccgtaacagtgctaga, cagagaccatgccgtaacatgctatga, cagagaccatgccgtaacatgcttaga, cagagaccatgccgtaacatgtctaga, cagagaccatgccgtacaagtgctaga, cagagaccatgccgtacaatgctatga, cagagaccatgccgtacaatgcttaga, cagagaccatgccgtacaatgtctaga, cagagccatcctagctaaagtgctaga, cagagccatcctagctaaatgctatga, cagagccatcctagctaaatgcttaga, cagagccatcctagctaaatgtctaga, cagagccatcctgactaaagtgctaga, cagagccatcctgactaaatgctatga, cagagccatcctgactaaatgcttaga, cagagccatcctgactaaatgtctaga, cagagccatcctgctaaatgctatcga, cagagccatcctgctaaatgctatgca, cagagccatcctgctaacagtgctaga, cagagccatcctgctaacatgctatga, cagagccatcctgctaacatgcttaga, cagagccatcctgctaacatgtctaga, cagagccatcctgctacaagtgctaga, cagagccatcctgctacaatgctatga, cagagccatcctgctacaatgcttaga, cagagccatcctgctacaatgtctaga]