| CODENOTIFIER | Help · You are not signed in · Sign in |
Project: PUNKsearch
Revision: 537
Author: ysoldak
Date: 24 Aug 2008 18:30:32
Changes: Dumping HostCrawler statuses to punksearch-crawl.status was moved from cli to core.
Minor refactorings.
| ... | ...@@ -56,19 +56,9 @@ | |
| 56 | 56 | private List<IpRange> ranges; |
| 57 | 57 | |
| 58 | 58 | private List<HostCrawler> threadList = Collections.synchronizedList(new ArrayList<HostCrawler>()); |
| 59 | private Set<Timer> timers = new HashSet<Timer>(); | |
| 59 | 60 | |
| 60 | /** | |
| 61 | * Constructor extracts configuration from system properties. The system property names are defined by static final | |
| 62 | * fields of Settings class. | |
| 63 | */ | |
| 64 | 61 | private NetworkCrawler() { |
| 65 | this.indexDirectory = PunksearchFs.resolveIndexDirectory(); | |
| 66 | this.forceUnlock = Boolean.valueOf(System.getProperty(Settings.UNLOCK_PROPERTY, "false")); | |
| 67 | this.threadCount = Integer.getInteger(Settings.THREADS_PROPERTY, 5); | |
| 68 | this.fileTypes = FileTypes.readFromDefaultFile(); | |
| 69 | this.daysToKeep = Float.parseFloat(System.getProperty(Settings.KEEPDAYS_PROPERTY, "7")); | |
| 70 | this.maxHours = Integer.getInteger(Settings.MAXHOURS_PROPERTY, 12); | |
| 71 | this.ranges = parseRanges(System.getProperty(Settings.RANGE_PROPERTY)); | |
| 72 | 62 | } |
| 73 | 63 | |
| 74 | 64 | public static NetworkCrawler getInstance() { |
| ... | ...@@ -99,6 +89,7 @@ | |
| 99 | 89 | * Starts the crawling process. Starts all threads, merges temp indexes into main one, clears temp files. |
| 100 | 90 | */ |
| 101 | 91 | public synchronized void run() { |
| 92 | readProperties(); | |
| 102 | 93 | |
| 103 | 94 | if (!prepareAllIndexDirs()) { |
| 104 | 95 | __log.warn("Can't start crawling. Something wrong with an index directory (check log)."); |
| ... | ...@@ -113,8 +104,7 @@ | |
| 113 | 104 | long startTime = new Date().getTime(); |
| 114 | 105 | __log.info("Crawl process started"); |
| 115 | 106 | |
| 116 | Timer processTimer = new Timer(); | |
| 117 | processTimer.schedule(new MaxRunWatchDog(this), maxHours * 3600 * 1000L); | |
| 107 | startTimers(); | |
| 118 | 108 | |
| 119 | 109 | IpIterator iter = new SynchronizedIpIterator(ranges); |
| 120 | 110 | synchronized (threadList) { |
| ... | ...@@ -161,7 +151,41 @@ | |
| 161 | 151 | synchronized (threadList) { |
| 162 | 152 | threadList.clear(); |
| 163 | 153 | } |
| 164 | processTimer.cancel(); | |
| 154 | cancelTimers(); | |
| 155 | } | |
| 156 | ||
| 157 | /** | |
| 158 | * Extracts configuration from system properties. | |
| 159 | * | |
| 160 | * The system property names are defined by static final fields of Settings class. | |
| 161 | */ | |
| 162 | private void readProperties() { | |
| 163 | this.indexDirectory = PunksearchFs.resolveIndexDirectory(); | |
| 164 | this.forceUnlock = Boolean.valueOf(System.getProperty(Settings.UNLOCK_PROPERTY, "false")); | |
| 165 | this.threadCount = Integer.getInteger(Settings.THREADS_PROPERTY, 5); | |
| 166 | this.fileTypes = FileTypes.readFromDefaultFile(); | |
| 167 | this.daysToKeep = Float.parseFloat(System.getProperty(Settings.KEEPDAYS_PROPERTY, "7")); | |
| 168 | this.maxHours = Integer.getInteger(Settings.MAXHOURS_PROPERTY, 12); | |
| 169 | this.ranges = parseRanges(System.getProperty(Settings.RANGE_PROPERTY)); | |
| 170 | } | |
| 171 | ||
| 172 | private void startTimers() { | |
| 173 | Timer processTimer = new Timer(); | |
| 174 | processTimer.schedule(new MaxRunWatchDog(), maxHours * 3600 * 1000L); | |
| 175 | ||
| 176 | Timer statusDumpTimer = new Timer(); | |
| 177 | long dumpPeriod = Long.getLong(Settings.DUMP_STATUS_PERIOD, 10L) * 1000; | |
| 178 | statusDumpTimer.scheduleAtFixedRate(new ThreadStatusDump(), dumpPeriod, dumpPeriod); | |
| 179 | ||
| 180 | timers.add(processTimer); | |
| 181 | timers.add(statusDumpTimer); | |
| 182 | } | |
| 183 | ||
| 184 | private void cancelTimers() { | |
| 185 | for (Timer timer : timers) { | |
| 186 | timer.cancel(); | |
| 187 | } | |
| 188 | timers.clear(); | |
| 165 | 189 | } |
| 166 | 190 | |
| 167 | 191 | private boolean prepareAllIndexDirs() { |
| ... | ...@@ -314,7 +338,7 @@ | |
| 314 | 338 | if (forceUnlock) { |
| 315 | 339 | IndexOperator.unlock(dir); |
| 316 | 340 | } else { |
| 317 | __log.info("Index directory is locked: '" + dir + "' " | |
| 341 | __log.warn("Index directory is locked: '" + dir + "' " | |
| 318 | 342 | + "Consider to set \"*.crawler.forceunlock=true\" in punksearch.properties"); |
| 319 | 343 | return false; |
| 320 | 344 | } |
| ... | ...@@ -324,15 +348,44 @@ | |
| 324 | 348 | } |
| 325 | 349 | |
| 326 | 350 | private class MaxRunWatchDog extends TimerTask { |
| 351 | public void run() { | |
| 352 | __log.info("Stopping crawling due to time limit"); | |
| 353 | NetworkCrawler.getInstance().stop(); | |
| 354 | } | |
| 355 | } | |
| 327 | 356 | |
| 328 | private NetworkCrawler crawler; | |
| 357 | private class ThreadStatusDump extends TimerTask { | |
| 329 | 358 | |
| 330 | public MaxRunWatchDog(NetworkCrawler crawler) { | |
| 331 | this.crawler = crawler; | |
| 332 | } | |
| 359 | public static final String STATUS_FILENAME = "punksearch-crawl.status"; | |
| 333 | 360 | |
| 334 | 361 | public void run() { |
| 335 | crawler.stop(); | |
| 362 | List<HostCrawler> threads = NetworkCrawler.getInstance().getThreads(); | |
| 363 | String dump = ""; | |
| 364 | for (HostCrawler thread : threads) { | |
| 365 | boolean stop = thread.isStopRequested(); | |
| 366 | String status = "unknown"; | |
| 367 | if (stop) { | |
| 368 | if (thread.getIp() != null) { | |
| 369 | status = "stopping"; | |
| 370 | } else { | |
| 371 | status = "stopped manually"; | |
| 372 | } | |
| 373 | } else { | |
| 374 | if (thread.getIp() != null) { | |
| 375 | status = "crawling " + thread.getIp(); | |
| 376 | } else { | |
| 377 | status = "finished successfully"; | |
| 378 | } | |
| 379 | } | |
| 380 | dump += thread.getName() + " : " + status + " : " + thread.getCrawledHosts().size() + "\n"; | |
| 381 | } | |
| 382 | String path = System.getProperty("java.io.tmpdir") + File.separator + STATUS_FILENAME; | |
| 383 | try { | |
| 384 | FileUtils.writeStringToFile(new File(path), dump); | |
| 385 | } catch (IOException e) { | |
| 386 | __log.warn("Can't write crawler status to file: " + path); | |
| 387 | } | |
| 336 | 388 | } |
| 337 | 389 | } |
| 390 | ||
| 338 | 391 | } |
| ... | ...@@ -20,36 +20,38 @@ | |
| 20 | 20 | /** |
| 21 | 21 | * Use this property to customize directory for temporary indexes |
| 22 | 22 | */ |
| 23 | public static final String TMP_DIR_PROPERTY = "org.punksearch.crawler.tmpdir"; | |
| 23 | public static final String TMP_DIR_PROPERTY = "org.punksearch.crawler.tmpdir"; | |
| 24 | 24 | /** |
| 25 | 25 | * Whether to unlock main and temporary index directories |
| 26 | 26 | */ |
| 27 | public static final String UNLOCK_PROPERTY = "org.punksearch.crawler.forceunlock"; | |
| 27 | public static final String UNLOCK_PROPERTY = "org.punksearch.crawler.forceunlock"; | |
| 28 | 28 | /** |
| 29 | 29 | * Number of threads to use for crawling the network (use values between 1 and 10) |
| 30 | 30 | */ |
| 31 | public static final String THREADS_PROPERTY = "org.punksearch.crawler.threads"; | |
| 31 | public static final String THREADS_PROPERTY = "org.punksearch.crawler.threads"; | |
| 32 | 32 | /** |
| 33 | 33 | * The comma separated list of IP ranges or path to the file with IPs. |
| 34 | 34 | */ |
| 35 | public static final String RANGE_PROPERTY = "org.punksearch.crawler.range"; | |
| 35 | public static final String RANGE_PROPERTY = "org.punksearch.crawler.range"; | |
| 36 | 36 | /** |
| 37 | 37 | * Lifetime of old items in the index (may be real number). |
| 38 | 38 | */ |
| 39 | public static final String KEEPDAYS_PROPERTY = "org.punksearch.crawler.keepdays"; | |
| 39 | public static final String KEEPDAYS_PROPERTY = "org.punksearch.crawler.keepdays"; | |
| 40 | 40 | /** |
| 41 | 41 | * Maximum hours to wait for a crawling thread to finish before interrupting it. |
| 42 | 42 | */ |
| 43 | public static final String MAXHOURS_PROPERTY = "org.punksearch.crawler.maxhours"; | |
| 43 | public static final String MAXHOURS_PROPERTY = "org.punksearch.crawler.maxhours"; | |
| 44 | 44 | |
| 45 | public static final String DEEP = "org.punksearch.crawler.deep"; | |
| 45 | public static final String DEEP = "org.punksearch.crawler.deep"; | |
| 46 | 46 | |
| 47 | public static final String BOOST_CREATE_DATE = "org.punksearch.crawler.boost.createdate"; | |
| 48 | public static final String BOOST_DEEP = "org.punksearch.crawler.boost.deep"; | |
| 49 | public static final String BOOST_SIZE = "org.punksearch.crawler.boost.size"; | |
| 47 | public static final String BOOST_CREATE_DATE = "org.punksearch.crawler.boost.createdate"; | |
| 48 | public static final String BOOST_DEEP = "org.punksearch.crawler.boost.deep"; | |
| 49 | public static final String BOOST_SIZE = "org.punksearch.crawler.boost.size"; | |
| 50 | 50 | |
| 51 | public static final String HEADER_USE = "org.punksearch.crawler.data.header"; | |
| 52 | public static final String HEADER_LENGTH = "org.punksearch.crawler.data.header.length"; | |
| 53 | public static final String HEADER_THRESHOLD = "org.punksearch.crawler.data.header.threshold"; | |
| 51 | public static final String HEADER_USE = "org.punksearch.crawler.data.header"; | |
| 52 | public static final String HEADER_LENGTH = "org.punksearch.crawler.data.header.length"; | |
| 53 | public static final String HEADER_THRESHOLD = "org.punksearch.crawler.data.header.threshold"; | |
| 54 | ||
| 55 | public static final String DUMP_STATUS_PERIOD = "org.punksearch.crawler.dump.status.period"; | |
| 54 | 56 | |
| 55 | 57 | } |