Java代理发现机器人

我写了一个类 ProxyFinder:它随机生成 IP 地址并先 ping 它们;如果对方有响应,就尝试通过常见的代理端口建立 HTTP 代理连接。

目前,它只是连接到随机 IP。这样速度相对较快,每小时能发现一些代理。但是,我想以某种方式检查之前是否已经连接过某个 IP。起初我尝试把它们保存在一个列表中,但这占用了超过 10GB 的内存。我在下面的代码中包含了一个方法,它使用 RandomAccessFile 把数据写入缓存文件,但速度非常慢:每次连接都要搜索整个文件,而这个文件还在不断变大。

我以尽可能小的格式存储数据,每个 IP 只需四个字节。即便如此,总量仍是 4 × 256 × 256 × 256 × 256 字节 = 16GB 的内存,或者是一个每次测试新 IP 时都要搜索的 16GB 文件。

我还尝试创建一个单独的线程来生成ips,根据文件检查它们,然后将它们添加到探针线程可以从中拉出的队列中。 它无法跟上探测线程。

如何快速检查我是否已连接到IP,而不是非常慢或使用可笑的内存量?

package net;

import java.io.File;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Probes random IPv4 addresses for open HTTP proxies.
 *
 * <p>Each worker thread repeatedly picks a random address, checks that the
 * host is reachable and, if so, tries to fetch a page through it on every
 * port in {@link #PORTS}.
 *
 * @author Colby
 */
public class ProxyFinder {

    /**
     * Number of probe threads started by {@link #main}.
     *
     * <p>NOTE(review): the loop bound and the thread/lambda opener were lost
     * when this listing was extracted (the text after {@code for (int i = 0; i}
     * is missing). 100 is a placeholder - confirm the intended value.
     */
    private static final int THREAD_COUNT = 100;

    /** Proxy ports tried on every reachable host (3128 was listed twice; deduplicated). */
    private static final int[] PORTS = {1080, 3128, 8080};

    /** Size of the IPv4 address space, used for the progress percentage. */
    private static final float ADDRESS_SPACE = 256f * 256f * 256f * 256f;

    /** Backing file of {@link #getNextAddress()}; created lazily on first use. */
    private static RandomAccessFile cache;

    /**
     * Starts the probe threads. The threads loop forever, so the JVM keeps
     * running after this method returns.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting network probe");
        AtomicInteger counter = new AtomicInteger();
        for (int i = 0; i < THREAD_COUNT; i++) {
            new Thread(() -> probeForever(counter)).start();
        }
    }

    /**
     * Worker loop: pick an address, ping it, and test the proxy ports.
     *
     * @param counter shared count of hosts that answered the ping
     */
    private static void probeForever(AtomicInteger counter) {
        while (true) {
            try {
                byte[] addrBytes = randomAddress(); // could be getNextAddress also
                if (addrBytes == null) {
                    return;
                }
                InetAddress addr = InetAddress.getByAddress(addrBytes);
                if (!ping(addr)) {
                    continue;
                }
                // The percentage is sampled before the increment, as in the original.
                float percent = (counter.get() / ADDRESS_SPACE) * 100F;
                if (counter.incrementAndGet() % 10000 == 0) {
                    System.out.println("Searching " + percent + "% network search");
                }
                for (int port : PORTS) {
                    if (isHttpProxy(addr, port)) {
                        System.out.println("Proxy found!"
                                + addr.getHostAddress() + ":" + port
                                + " Found at " + percent + "% network search");
                    }
                }
            } catch (Exception ignored) {
                // Best effort: a failure on one address must not stop the worker.
            }
        }
    }

    /**
     * Tries to fetch http://google.com through {@code addr:port} as an HTTP proxy.
     *
     * @return true if the request completed without an exception
     */
    private static boolean isHttpProxy(InetAddress addr, int port) {
        HttpURLConnection con = null;
        try {
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(addr, port));
            con = (HttpURLConnection) new URL("http://google.com").openConnection(proxy);
            con.setConnectTimeout(1000);
            con.setReadTimeout(1000);
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", "Mozilla/5.0");
            con.getContent();
            return true;
        } catch (Exception ignored) {
            // Expected for almost every host: it is simply not a proxy on this port.
            return false;
        } finally {
            // Release the connection on failure as well as on success.
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Returns a random address that is not yet recorded in the cache file and
     * appends it to that file.
     *
     * <p>Every call scans the file from the start, so the cost grows with the
     * number of addresses already recorded. The method is synchronized
     * because the shared file's position is moved while scanning and writing.
     */
    private static synchronized byte[] getNextAddress() throws Exception {
        if (cache == null) {
            cache = new RandomAccessFile(File.createTempFile("abc", ".tmp"), "rw");
        }
        byte[] stored = new byte[4];
        while (true) {
            byte[] candidate = randomAddress();
            boolean alreadyStored = false;
            long length = cache.length();
            cache.seek(0);
            while (cache.getFilePointer() < length) {
                cache.readFully(stored);
                if (Arrays.equals(candidate, stored)) {
                    alreadyStored = true;
                    break;
                }
            }
            if (!alreadyStored) {
                cache.seek(length); // append at the end of the file
                cache.write(candidate);
                return candidate;
            }
        }
    }

    /** Returns four random bytes forming an IPv4 address. */
    private static byte[] randomAddress() {
        return new byte[]{
            (byte) (Math.random() * 256),
            (byte) (Math.random() * 256),
            (byte) (Math.random() * 256),
            (byte) (Math.random() * 256)
        };
    }

    /** Returns whether {@code addr} is reachable within 500 ms. */
    private static boolean ping(InetAddress addr) throws Exception {
        return addr.isReachable(500);
    }
}

此外,如果有人想知道:我现在已经运行了 12 个小时,它发现了大约 50 个代理,已探测了大约 2.09664E-4% 的 IP 范围,约 120 万个 IP。就分配到的带宽(0.5Mbps)而言,这个结果还不错。

编辑:我开始认为,存储和检查所有这些IP的开销可能比在搜索ip范围的末端附近连接到许多重复项更大。

由于数据量的原因,我不会存储整个IP地址。 将它们存储在BitSet数组中会消耗更少的内存。

编辑:删除了之前不正确的代码版本。

下面的版本生成随机地址并将其保存在文件中。 如果找到先前运行的持久性文件,则从该文件恢复所看到的地址的信息。

以下情况在初始版本中未正确处理:

 assuming that no address was already seen 1.0.0.1 - seen false 2.0.0.2 - seen false 2.0.0.1 - seen true, which was wrong and is correctly handled by code below 

有关详细信息,请参阅代码中的注释。

 /**
  * Keeps one "seen" flag per IPv4 address in an array of bit sets and
  * persists the flags between runs in the file {@code addresses.bin}.
  *
  * <p>The first octet selects one of {@link #RANGES_SIZE} bit sets; the
  * remaining three octets form the bit index inside that set.
  */
 public class KeepSeenAddresses {

     static final int FILE_BUFFER_SIZE = 81_920;

     /** Number of ranges: one per possible value of the first octet. */
     static final int RANGES_SIZE = 256;

     /** Bits per range: one for every combination of the last three octets. */
     static final int RANGE_BITS = 1 << 24;

     static BitSet[] ranges;

     // Random(1) is taken only for demonstration purpose, so the second
     // application run will find the same seen addresses from previous run
     static Random random = new Random(1);
     // for normal use it's better to have better randomness
     //static Random random = new Random(System.currentTimeMillis());

     public static void main(String[] args) throws IOException, ClassNotFoundException {
         if (!readRanges()) {
             initRanges();
         }

         // uncomment this block to see how the edge cases
         // which were mentioned in other comments are handled
         /*
         byte[][] addresses = {
             {1, 0, 0, 1},
             {2, 0, 0, 2},
             {2, 0, 0, 1},
             {1, 2, 3, 4},
             {4, 3, 2, 1},
             {(byte)128, 0, 0, 0},
             {(byte)255, (byte)255, (byte)255, (byte)255}
         };
         seenAddress(addresses[0]);
         seenAddress(addresses[1]);
         seenAddress(addresses[3]);
         seenAddress(addresses[5]);
         seenAddress(addresses[6]);
         for (byte[] addressBytes : addresses) {
             System.out.printf("seen %s before: %s%n",
                     prettyAddress(addressBytes),
                     seenBefore(addressBytes)
             );
         }
         */
         processAddresses();
         persistRanges();
     }

     /**
      * Read the seen addresses from a file.
      *
      * <p>NOTE: a file written with a different offset formula has a different
      * bit layout and cannot be interpreted by this version.
      *
      * @return true if the file was found and has the expected
      * number of ranges, otherwise false
      * @throws IOException
      * @throws ClassNotFoundException
      */
     private static boolean readRanges() throws IOException, ClassNotFoundException {
         File rangesStore = new File("addresses.bin");
         if (!rangesStore.exists()) {
             return false;
         }
         System.out.print("found previous rangesStore... ");
         try (ObjectInputStream ois = new ObjectInputStream(
                 new BufferedInputStream(
                         new FileInputStream(rangesStore), FILE_BUFFER_SIZE
                 )
         )) {
             ranges = (BitSet[]) ois.readObject();
         }
         // A serialized null would otherwise fail with a NullPointerException below.
         int found = (ranges == null) ? 0 : ranges.length;
         if (found != RANGES_SIZE) {
             System.out.printf("wrong size of rangesStore: expected %d"
                     + " found: %d%n", RANGES_SIZE, found);
             return false;
         } else {
             System.out.printf("restored ranges: %d%n", found);
             return true;
         }
     }

     /**
      * Initialize the address ranges array. All address flags start as false;
      * a new BitSet has every bit clear, so no explicit clearing is needed.
      */
     private static void initRanges() {
         System.out.print("initialize new rangesStore... ");
         ranges = new BitSet[RANGES_SIZE];
         for (int i = 0; i < RANGES_SIZE; i++) {
             ranges[i] = new BitSet(RANGE_BITS);
         }
         System.out.printf("initialized ranges: %d%n", RANGES_SIZE);
     }

     /**
      * For demonstration purpose.
      * Generates some random IPv4 addresses. If the address was not seen before
      * the flag for this address will be set to true.
      */
     private static void processAddresses() {
         for (int i = 0; i < 10; i++) {
             byte[] addrBytes = randomAddress();
             boolean seenBefore = seenBefore(addrBytes);
             if (!seenBefore) {
                 seenAddress(addrBytes);
             }
             System.out.printf("seen %s before: %s%n",
                     prettyAddress(addrBytes),
                     seenBefore
             );
         }
     }

     /**
      * Persist the address ranges array. The file size is around 500MB.
      *
      * @throws IOException
      */
     private static void persistRanges() throws IOException {
         System.out.print("persist rangesStore... ");
         try (ObjectOutputStream oos = new ObjectOutputStream(
                 new BufferedOutputStream(
                         new FileOutputStream("addresses.bin"), FILE_BUFFER_SIZE)
         )) {
             oos.writeObject(ranges);
         }
         System.out.printf("written ranges: %d%n", ranges.length);
     }

     /**
      * Keep a flag which address has been seen already.
      *
      * @param addrBytes IPv4 address in four bytes
      */
     static void seenAddress(byte[] addrBytes) {
         ranges[rangeIndex(addrBytes)].set(rangeOffset(addrBytes));
     }

     /**
      * Check if the passed address was seen before.
      *
      * @param addrBytes IPv4 address in four bytes
      * @return true if the address was seen before, otherwise
      * false
      */
     static boolean seenBefore(byte[] addrBytes) {
         return ranges[rangeIndex(addrBytes)].get(rangeOffset(addrBytes));
     }

     /** Index of the range for this address: the first octet as 0..255. */
     private static int rangeIndex(byte[] addrBytes) {
         return addrBytes[0] & 0xff;
     }

     /**
      * Bit index inside a range: the last three octets packed into 24 bits.
      * Each byte is masked before shifting so that it is treated as unsigned;
      * the parentheses matter because {@code *} and {@code <<} bind tighter
      * than {@code &}.
      */
     private static int rangeOffset(byte[] addrBytes) {
         return ((addrBytes[1] & 0xff) << 16)
                 | ((addrBytes[2] & 0xff) << 8)
                 | (addrBytes[3] & 0xff);
     }

     /**
      * Convert the IPv4 address into pretty string.
      *
      * @param addrBytes IPv4 address in four bytes
      * @return pretty String of the IPv4 address
      */
     static String prettyAddress(byte[] addrBytes) {
         return String.format("%03d.%03d.%03d.%03d",
                 addrBytes[0] & 0xff,
                 addrBytes[1] & 0xff,
                 addrBytes[2] & 0xff,
                 addrBytes[3] & 0xff);
     }

     /**
      * Generate a random IPv4 address.
      *
      * @return four bytes of a random generated IPv4 address
      */
     private static byte[] randomAddress() {
         byte[] bytes = new byte[4];
         for (int i = 0; i < bytes.length; i++) {
             bytes[i] = (byte) random.nextInt(256);
         }
         return bytes;
     }
 }

我已经从另一个解决方案移植代码以适应这个问题: Java-将多维数组映射到单个

上述问题的答案深入解释了以下代码的工作原理。如果有人想针对本问题给出更深入的回答,我会将其采纳为答案。

 /**
  * Seen flags for addresses whose first octet is 0-127, i.e. those whose
  * packed position is a non-negative int.
  */
 static BitSet set = new BitSet();

 /**
  * Seen flags for addresses whose first octet is 128-255. Their packed
  * position overflows into a negative int, which a BitSet cannot index,
  * so they are kept in a second set with the sign bit cleared.
  */
 static BitSet upperSet = new BitSet();

 /**
  * Packs four octets (each expected in 0..255) into one int position.
  * For {@code i >= 128} the result wraps around and is negative.
  */
 static int pos(int i, int j, int k, int m) {
     return ((256 * 256 * 256) * i) + ((256 * 256) * j) + (256 * k) + m;
 }

 /**
  * Returns whether the flag for the given address is set.
  *
  * @param addr IPv4 address in four bytes
  */
 static boolean get(byte[] addr) {
     // Mask each byte: Java bytes are signed, octets are not.
     int p = pos(addr[0] & 0xff, addr[1] & 0xff, addr[2] & 0xff, addr[3] & 0xff);
     return p >= 0 ? set.get(p) : upperSet.get(p & Integer.MAX_VALUE);
 }

 /**
  * Sets or clears the flag for the given address.
  *
  * @param addr IPv4 address in four bytes
  * @param flag the value to store
  */
 static void set(byte[] addr, boolean flag) {
     int p = pos(addr[0] & 0xff, addr[1] & 0xff, addr[2] & 0xff, addr[3] & 0xff);
     if (p >= 0) {
         set.set(p, flag);
     } else {
         upperSet.set(p & Integer.MAX_VALUE, flag);
     }
 }

可以使用 MySQL 之类的数据库,并配合带有一级和二级缓存的 Hibernate。

如果您为 Hibernate 配置好缓存,并把数据库调整为使用几 GB 的缓存,它会比纯内存方案更快。我认为这两者都能做到。您还可以配置像 Ehcache 这样的外部缓存,让它运行在另一个进程中并配合文件存储,同时设置大小和时间限制。对于 IP 地址这种规模的数据,数据库知道如何比纯内存更快地建立索引和进行搜索。

另外,您还可以按第一个字符、第二个字符等对表数据和索引进行分区,以进一步改进。