forked from LAW-Unimi/BUbiNG
-
Notifications
You must be signed in to change notification settings - Fork 0
/
folklore.properties
32 lines (32 loc) · 1.59 KB
/
folklore.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# BUbiNG crawl configuration for a crawl seeded at http://www.folklore.org/.
# Inside filter expressions, commas in argument lists are written as "\," --
# presumably so the properties loader does not split the value into a list
# (TODO confirm); keep the escapes when editing.

# --- Storage location and per-site / thread-pool sizing ---
rootDir=crawl_folklore
maxUrlsPerSchemeAuthority=1000
parsingThreads=64
dnsThreads=50
fetchingThreads=512

# --- Filters ---
# fetchFilter and followFilter are identical: http/https URLs whose host ends
# with "folklore.org".
# NOTE(review): the suffix has no leading dot, unlike the ".org"-style suffixes
# in scheduleFilter. If the match is a plain string-suffix test, a host such as
# "notfolklore.org" would also pass -- confirm against the filter's docs.
fetchFilter=( SchemeEquals(http) or SchemeEquals(https) ) and HostEndsWithOneOf(folklore.org)
# scheduleFilter: http/https, host in one of the listed TLDs, path not ending in
# one of the listed (mostly media/archive/asset) extensions, URL shorter than
# 2048, fewer than 3 duplicate segments.
# NOTE(review): this host list is far broader than fetchFilter, which only
# admits folklore.org hosts -- confirm the wide TLD list is intentional here.
scheduleFilter=( SchemeEquals(http) or SchemeEquals(https) ) and HostEndsWithOneOf(.ad\,.org\,.al\,.at\,.be\,.bg\,.ch\,.cz\,.de\,.dk\,.ee\,.es\,.eu\,.fi\,.fo\,.fr\,.gb\,.gr\,.hr\,.hu\,.ie\,.im\,.is\,.it\,.li\,.lt\,.lu\,.lv\,.mc\,.md\,.me\,.nl\,.no\,.pl\,.pt\,.ro\,.se\,.si\,.sk\,.sm\,.uk\,.va) and not PathEndsWithOneOf(.axd\,.xls\,.rar\,.sflb\,.tmb\,.pdf\,.js\,.swf\,.rss\,.kml\,.m4v\,.tif\,.avi\,.iso\,.mov\,.ppt\,.bib\,.docx\,.css\,.fits\,.png\,.gif\,.jpg\,.jpeg\,.ico\,.doc\,.wmv\,.mp3\,.mp4\,.aac\,.ogg\,.wma\,.gz\,.bz2\,.Z\,.z\,.zip) and URLShorterThan(2048) and DuplicateSegmentsLessThan(3)
followFilter=( SchemeEquals(http) or SchemeEquals(https) ) and HostEndsWithOneOf(folklore.org)
# parseFilter requires the exact host www.folklore.org.
# NOTE(review): this is narrower than fetchFilter/followFilter, so responses
# from other *folklore.org hosts would be fetched but not parsed -- confirm.
parseFilter=HostEquals(www.folklore.org)
# storeFilter: text/* content types or the listed path extensions, excluding
# anything IsProbablyBinary() flags.
storeFilter=( ContentTypeStartsWith(text/) or PathEndsWithOneOf(.html\,.htm\,.txt\,.py\,.php) ) and not IsProbablyBinary()

# --- Request pacing and crawl size ---
schemeAuthorityDelay=10s
ipDelay=2s
maxUrls=1M
bloomFilterPrecision=1E-8

# --- Crawl entry point ---
seed=http://www.folklore.org/

# --- Network timeouts, buffers and cookies ---
socketTimeout=60s
connectionTimeout=60s
fetchDataBufferByteSize=1200K
cookiePolicy=compatibility
cookieMaxByteSize=12000

# --- Crawler identification and robots handling ---
userAgent=BUbiNG (+http://law.di.unimi.it/BUbiNG.html)
userAgentFrom=nurul@ferdo.us
robotsExpiration=1h

# --- Response handling ---
responseBodyMaxByteSize=2M
# The same algorithm name (MurmurHash3) also appears in parserSpec below;
# presumably the two should be changed together -- verify before editing one.
digestAlgorithm=MurmurHash3
startPaused=false

# --- In-memory / on-disk structure sizes ---
workbenchMaxByteSize=512Mi
urlCacheMaxByteSize=1Gi
sieveSize=64Mi

# --- Parser, connection reuse and store implementation ---
parserSpec=HTMLParser(MurmurHash3)
# NOTE(review): no unit suffix here, unlike the other durations in this file
# (e.g. socketTimeout=60s) -- confirm which unit a bare number is read as.
keepAliveTime=60
# Fully qualified class name of the store implementation to use.
storeClass=it.unimi.di.law.bubing.store.ESStore