Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Update properties table based on recent Nutch 1.x (master) and 2.4

...

This list of Nutch configuration properties is intended for development. It includes deprecated properties and properties used only "internally". The list is generated from nutch-default.xml and Java sources.

Legend:
*Def. *

...

Def.: defined in nutch-default.xml

...

*Used *

indent

 read or set from Java code
 

*Temp. *

indent

 temporarily used to pass settings (eg, from command-line arguments) to map or reduce jobs
 

*Depr. *

indent

 deprecated
 

*(owr.) *

...

Used: read or set from Java code

Temp.: temporarily used to pass settings (e.g., from command-line arguments) to map or reduce jobs

Depr.: deprecated

(owr.): some properties are defined in nutch-default.xml (and may be set in nutch-site.xml) but are overwritten programmatically (tests and benchmarks are excluded), e.g. via a command-line argument in some Nutch tools


*(test) *

indent

 overwritten only in tests and benchmarks
 

 

 

Trunk

 

 

 

 

2.x

 

Property

Def.


1.X (master Branch)

2.x (deprecated codebase)

Property

Def.

Used

Temp.

Depr.

Def.

Used

Temp.

Depr.

CrawlDBScanner
anchorIndexingFilter.
regex

CrawlDBScanner.status

deduplicateXX

 

 

 

 

 



anchorIndexingFilter.deduplicate
XX

 

 

 

 

 



 
any23.content_typesXX

(test)







any23.extractorsXX
(test)

 







arc.url.version-X

 

 

 

 

 

 







batch.proxy.port



-X

content.server.port-X

-X

 



cosine.goldstandard.fileX
 
X

 







crawl.datum.processor.overdue.time.limit-X
 






crawl.gen.delayXX
 


X

 

X

X

 



crawldb.inject.filter.normalize.all-X
 






crawldb.url.filtersXX

 

 

 

(owr.)


X

 


crawldb.url.normalizersX

 

 

 

 

 

 

X(owr.)




crawldb.url.normalizers.scope

X

 

 

 

 

 

 

-X





creativecommons.exclude.unlicensed-X
 


-

 

X

 

 



db.
default.
fetch.interval.defaultXX
 

NUTCH-1409



XX

 

NUTCH-1409



db.fetch.interval.
default
maxXX

(test)

 



XX

 

 



db.fetch.
interval
retry.maxXX

 

 



XX

 

 



db.fetch.
retry.max

X

X

 

 

X

X

 

 

db.fetch.
schedule.adaptive.dec_rateXX
 

 



XX

 

 



db.fetch.schedule.adaptive.inc_rateXX

 

 



XX
 

 



db.fetch.schedule.adaptive.max_intervalXX

 

 


XX

 

 



db.fetch.schedule.adaptive.min_intervalXX

 

 



XX

 

 



db.fetch.schedule.adaptive.sync_deltaXX

 

 



XX

 

 

 


db.fetch.schedule.adaptive.sync_delta_rateX

X

-

 



X

X

 

 

-

db.
db.
fetch.schedule.classXX

(test)

 



XX

 

 



db.fetch.schedule.mime.fileXX

 

 

 

 

 







db.ignore.also.redirectsXX





db.ignore.external.exemptions.fileX-
 






db.ignore.external.linksXX

X

 

X

db.ignore.external.links.mode
 
XX
 

 







db.ignore.internal.linksXX

 

 

 


X

 

-

db.injector.overwriteXX

 

 

 

 

 

 

db.
(owr.)




db.injector.updateXX(
test)

 

 

 

 

 
owr.)




db.max.anchor.length



X

X

 

 

X

 

 

-

db.max.
fetch
outlink.
interval
lengthX

 

NUTCH-1409
X

 

NUTCH-1409







db.max.
inlinks
outlinks.per.pageXX
 


X

 

X

 

 


db.
max
parsemeta.
outlinks
to.
per.page
crawldbXX

 

 

X

X

 

 

db.parsemeta.to.crawldb

X

X

 

 

X

 

 



X-

db.preserve.backupXX

 

 

 

 

 

 







db.reader.stats.sort-XX
 

-XX

 


db.reader.topn-XX

 

 

 

 

 






db.reader.topn.min-XX
 

 

 

 

 






db.score.count.filteredXX

 

 



XX
 

 



db.score.injectedXX

 

 



XX

 

 



db.score.link.externalXX

 

 


XX

 

 


db.score.link.internalXX
 

 



XX

 

 



db.signature.classXX

 

 



XX
 

 

 


db.signature.text_profile.min_token_lenX

X

-

 



X
X
-

 

 



db.signature.text_profile.quant_rateX-

X

 

-

db.stats.score.quantiles
 
XX

 

 






db.update.additions.allowedXX

 

 



XX

 

 

db.


db.update.max.inlinksXX
 

 



XX

 

 



db.update.purge.404XX

 

 

 

 

 

 







db.update.purge.orphansXX





dc.language-X

 

 

 

 

 

 







domain.statistics.mode-XX
 

-XX
 

elastic.cluster



X-

elastic.
index
host
 




X
 
-
 


elastic.index

 





X
 
-

 



elastic.max.bulk.docs

 

 

 

 

X

 

 





X-

elastic.max.bulk.size

 

 

 

 

X

 





X-

elastic.port



X-

elasticsearch.conf



-X
 


encodingdetector.charset.min.confidenceX
 
X
 


XX

 



exchanges.exchanges.fileXX
 






fail.on.job.failure
 




-

 

 

X

fetcher.bandwidth.targetX
 
X

 

 

fetcher.exit






fetcher.bandwidth.target.check.everyNSecsX

 

 

 

 

 

 

X





fetcher.filter.urlsXX





fetcher.follow.outlinks.depthXX

 

 

 

 

 

 







fetcher.follow.outlinks.depth.divisorX

X

 

 

 

 

 

 

-





fetcher.follow.outlinks.ignore.externalX

X

 

 

 

 

 

 

-





fetcher.follow.outlinks.num.linksX

X

 

 

 

 

 

-





fetcher.job.resume



-X

fetcher.job.sitemap



-X
 

 

 

 

 

X

 

 



fetcher.job.
resume
sitemap.detect



-X

fetcher.max.crawl.delayXX

 

 



XX
 

 



fetcher.max.exceptions.per.queueX-

X

 

-

fetcher.maxNum.threadsX
 
X

 







fetcher.min.crawl.delayXX
 






fetcher.
parse
normalize.urlsXX

(test)







fetcher.parseXX
 


XX

 



fetcher.publisherXX
 






fetcher.queue.depth.multiplierXX

 

 



XX

 

 

fetcher.


fetcher.queue.modeXX
 

 



XX

 

 



fetcher.queue.use.host.settings
 




X

 

 

X

fetcher.redirect.dedupcache.secondsX
 
X





fetcher.redirect.dedupcache.sizeX
 
X

 







fetcher.server.delayXX

 

 


XX
 

 



fetcher.server.min.delayXX

 



XX

fetcher.signature
 
XX
 

 







fetcher.store.contentXX

 



XX

fetcher.store.robotstxt
 
XX

 

 







fetcher.threads.fetchXX(owr.)
 

fetcher.threads.per.host

 

 

 

NUTCH-1409

 

 

 

NUTCH-1409


XX

 

 



fetcher.threads.per.host.by.ip

 

 

 

 

X

 





-X
 


fetcher.threads.per.queueXX

 

 



XX

 

 



fetcher.threads.timeout.divisorXX

 

 

 

 

 

 







fetcher.throughput.threshold.check.afterXX(owr.)
 

X
X
-

 

 



fetcher.throughput.threshold.pagesXX

 

 

X

X

 

 



X-

fetcher.throughput.threshold.retriesXX

 

 

 

 

 

 






fetcher.throughput.threshold.sequence

 

 

 

 





X
X
-

 

 



fetcher.timelimit-XX
 

-XX

 


fetcher.timelimit.minsXX

 

 


XX

 

 



fetcher.verbose



X

X

 

 

X

 

 

 
-

file.content.ignoredX

 

-

X
 
-

 



file.content.limitXX(
test
owr.)
  file

XX

(test)

 



file.crawl.parentXX

 



XX

file.crawl.redirect_noncanonical
  
X-

X

 

-

free.generator.filter-X

 

 

 

 

 

 







free.generator.normalize-X

 

 

 

 

 

 







ftp.content.limitXX

 

 



XX

 

 



ftp.follow.talkXX

 

 



XX
 

 



ftp.keep.connectionXX

 

 

ftp


XX

 

 



ftp.passwordXX
 

 



XX

 

 



ftp.server.timeoutXX

 

 



XX

 

 


ftp.timeoutXX

 

 


XX
 

 



ftp.usernameXX

 

 



XX

 

 



generate.batch.id
 




-
 
X
 


generate.count
 




-X

 

 



generate.count.modeXX

 

 



XX
 

 



generate.curTime-X
 


-

 

X

 

 


generate.
filter
expr-X

 







generate.fetch.delay.exprXX





generate.filter-X

-
 
X

 



generate.hostdbXX
 






generate.max.countXX

 



XX

generate.max.count.expr
 
XX

 

 






generate.max.distance
 

 

 

 





XX

 

 



generate.max.num.segments-X

 

 

 

 

 

 







generate.
max
min.
per.host
intervalXX

 

NUTCH-1409

 

 

 

NUTCH-1409

generate.max.per.host.by.ip

X

 

NUTCH-1409

 

 

 

NUTCH-1409







generate.min.scoreXX

XX

generate.normalise-X

-X

generate.partition.seed



-X

generate.restrict.statusXX





generate.sitemap



-X

generate.topN-X

-X

generate.update.crawldb
generate.min.interval
XX
 

generate.normalise

X

 

 

X

 

 

generate.partition.seed

 

 

 

 

X

 

 

generate.restrict.status

X

 

 

 

 

 

 

generate.topN

X

 

 

X

 

 

generate.update.crawldb

X

X

 

 

X

X

 

 



X

 

 

 

 

 

generate.min.score

X

X

 

 

X

 

 

X

gora.buffer.read.limit



X-

gora.buffer.write.limit



X-

hbase.indexer.commit.size



X-

hbase.indexer.mapping.file



X-

hbase.indexer.zookeeper.property.clientPort



X-

hbase.indexer.zookeeper.quorum



X-

headingsX-





headings.multivaluedXX





hostdb.check.failedXX





hostdb.check.knownXX





hostdb.check.newXX





hostdb.concurrency.level
 




-

 

 

X

hostdb.crawldatum.processorsX
 
X

 

 







hostdb.
lru
dump.
size
field.header
 
-

 

 

 

X

 

 

htmlparsefilter.order

X

X

 

 

X

X

 

 

http.accept

X

X

 

 

X

X

 

 

http.accept.language

X

X

 

 

X

X

 

 

http.agent.description

X

X

 

 

X

X

 

 

http.agent.email

X

X

 

 

X

X

 

 

http.agent.host

X

X

 

 

X

X

 

 

http.agent.name

X

X

(test)

 

X

X

(test)

 

http.agent.url

X

X

 

 

X

X

 

 

http.agent.version

X

X

 

 

X

X

 

 

http.auth.file

X

X

 

 

X

X

 

 

http.auth.verbose

X

 

 

X

 

 

http.content.limit

X

X

 

 

X

X

 

 

http.max.delays

X

 

 

X

 

 

http.proxy.host

X

X

(test)

 

X

X

(test)

 

http.proxy.password

X

X

 

 

X

X

 

 

http.proxy.port

X

X

(test)

 

X

X

(test)

 

http.proxy.realm

X

X

 

 

X

X

 

 

http.proxy.username

X

X

 

 

X

X

 

 

http.redirect.max

X

X

 

 

 

 

 

 

http.robots.403.allow

X

X

 

 

X

X

 

 

http.robots.agents

X

X

(test)

 

X

X

(test)

 

http.timeout

X

X

 

 

X

X

 

 

http.useHttp11

X

X

 

 

X

X

 

 

http.verbose

X

X

 

 

X

X

 

 

index.content.md

X

X

 

 

 

 

 

 

index.db.md

X

X

 

 

 

 

 

 

index.parse.md

X

X

(test)

 

 

 

 

 

index.replace.regexp

X

X

 

 

 

 

 

 

index.static

X

X

 

 

 

 

 

 

indexer.add.domain

X

X

 

 

 

 

 

 

indexer.delete

X

 

 

 

 

 

 

indexer.delete.robots.noindex

X

 

 

 

 

 

 

indexer.max.content.length

X

X

 

 

 

 

 

 

indexer.max.title.length

X

X

 

 

X

X

(test)

 

indexer.score.power

X

X

 

 

X

X

 

 

indexer.skip.notmodified

X

X

 

 

 

 

 

 

indexer.url.filters

X

X

 

 

 

X

 

indexer.url.normalizers

X

 

 

 

 

 

 

indexer.writer.classes

X

X

 

X

X

 

indexingfilter.order

X

X

 

 

X

X

 

 

injector.current.time

X

X

 

X

X

 

lang.analyze.max.length

X

X

 

 

X

 

 

lang.extraction.policy

X

X

 

 

X

X

 

 

lang.identification.only.certain

X

X

 

 

X

X

 

 

lang.ngram.max.length

 

 

 

 

X

 

 

lang.ngram.min.length

 

 

 

 

X

 

 

link.analyze.damping.factor

X

X

 

 

 

 

 

 

link.analyze.initial.score

X

X

 

 

 

 

 

 

link.analyze.iteration

X

X

 

 

 

 

 

link.analyze.normalize.score

X

 

 

X

 

 

link.analyze.num.iterations

X

X

 

 

 

 

 

 

link.analyze.rank.one

X

X

 

 

 

 

 

X





hostdb.dump.homepages-X





hostdb.dump.hostnames-X





hostdb.filter.expression-X





hostdb.force.checkXX





hostdb.lru.size



-X

hostdb.num.resolvers.threadsXX





hostdb.numeric.fieldsXX





hostdb.percentilesXX





hostdb.purge.failed.hosts.thresholdXX





hostdb.reading.crawldb-XX




hostdb.recheck.intervalXX





hostdb.string.fieldsXX





hostdb.url.filterXX





hostdb.url.normalizeXX





htmlparsefilter.orderXX

XX

htmlunit.enable.cssXX





htmlunit.enable.javascriptXX





htmlunit.javascript.timeoutXX





http.acceptXX

XX

http.accept.charsetXX

XX

http.accept.languageXX

XX

http.agent.descriptionXX

XX

http.agent.emailXX

XX

http.agent.hostXX

XX

http.agent.host.cookie.fileXX





http.agent.nameXX(owr.)
XX

http.agent.rotateXX

XX

http.agent.rotate.fileXX

XX

http.agent.urlXX

XX

http.agent.versionXX

XX

http.auth.fileXX

XX

http.auth.verbose-X

-X

http.content.limitXX(owr.)
XX

http.content.truncated-X





http.content.truncated.reason-X





http.enable.cookie.headerXX





http.enable.if.modified.since.headerXX





http.log.exceptions.suppress.stackXX





http.max.delays



X-

http.partial.truncatedXX





http.proxy.exception.listXX





http.proxy.hostXX

XX

http.proxy.passwordXX

XX

http.proxy.portXX

XX

http.proxy.realmXX

XX

http.proxy.typeXX





http.proxy.usernameXX

XX

http.redirect.maxXX





http.redirect.max.exceeded.skipXX





http.robot.rules.whitelistXX





http.robots.403.allowXX

XX

http.robots.agentsXX(owr.)
XX

http.store.responsetimeXX

XX

http.time.limitXX





http.timeoutXX

XX

http.tls.certificates.checkXX





http.tls.supported.cipher.suites-X

-X

http.tls.supported.protocols-X

-X

http.useHttp11XX

XX

http.useHttp2XX





http.verbose



XX

index.content.mdXX





index.db.mdXX





index.geoip.licensekeyXX





index.geoip.usageXX





index.geoip.useridXX





index.jexl.filterXX





index.links.hosts.onlyX-





index.links.inlinks.host.ignoreX-





index.links.outlinks.host.ignoreX-





index.metadata



XX

index.metadata.multivalued.fields-X





index.metadata.separatorXX





index.parse.mdXX





index.replace.regexpXX





index.staticXX





index.static.fieldsepXX





index.static.keysepXX





index.static.valuesepXX





indexer.add.domainXX





indexer.additional.params-X





indexer.binary.base64-X





indexer.delete-X





indexer.delete.robots.noindexXX





indexer.delete.skipped.by.indexingfilterXX





indexer.indexwriters.fileXX





indexer.max.content.lengthXX





indexer.max.title.lengthXX

XX

indexer.nocommit-X





indexer.score.powerXX

XX

indexer.skip.notmodifiedXX





indexer.url.filters-XX


X
indexer.url.normalizers-X





indexingfilter.orderXX

XX

injector.current.time-XX
-XX
interactiveselenium.handlersXX





io.file.buffer.size-X





io.serializationsX-

X-

jsoup.extractor.property.file



XX

lang.analyze.max.lengthXX

X-

lang.extraction.policyXX

XX

lang.identification.only.certainXX

XX

lang.index.languagesXX





lang.ngram.max.length



X-

lang.ngram.min.length



X-

libselenium.page.load.delay-X





link.analyze.damping.factorXX





link.analyze.initial.scoreXX





link.analyze.iteration-XX




link.analyze.normalize.score-X

-X

link.analyze.num.iterationsXX





link.analyze.rank.one-XX




link.delete.goneXX





link.ignore.internal.domainXX





link.ignore.internal.hostXX





link.ignore.limit.domainXX





link.ignore.limit.pageXX





link.score.updater.clear.scoreXX





linkdb.ignore.external.linksXX





linkdb.ignore.internal.linksXX





linkdb.max.anchor.lengthXX





linkdb.max.inlinksXX





linkdb.regex-XX




linkdb.url.filters-XX


X
linkdb.url.normalizer-X





linkdb.url.normalizer.scope-X





metatag.description-X





metatag.keyword-X





metatag.keywords-X





metatags.namesXX

XX

mime.type.magicXX

XX

mime.types.fileXX

XX

mimetype.filter.fileXX





moreIndexingFilter.indexMimeTypePartsXX

XX

moreIndexingFilter.mapMimeTypesXX





moreIndexingFilter.mapMimeTypes.fieldXX





nutch.conf.uuid-X

-X

nutch.fetch.time-X





org.apache.nutch.webui



-X

page.load.delayXX





parse.filter.urlsXX(owr.)




parse.job.force



-X

parse.job.resume



-X

parse.normalize.urlsXX(owr.)




parse.plugin.fileXX

XX

parse.sitemap



-X

parsefilter.naivebayes.trainfileXX





parsefilter.naivebayes.wordlistXX





parsefilter.regex.file-X





parsefilter.regex.rules-X





parser.caching.forbidden.policyXX

XX

parser.character.encoding.defaultXX

XX

parser.html.form.use_actionXX

XX

parser.html.implXX

XX

parser.html.line.separatorsXX





parser.html.outlinks.htmlnode_metadata_nameXX





parser.html.outlinks.ignore_tagsXX

XX

parser.html.outlinks.max.target.length



XX

parser.skip.truncatedXX

XX

parser.store.textXX





parser.timeoutXX

XX

partition.url.modeXX

XX

partition.url.seed-XX
-X

plugin.auto-activationXX

XX

plugin.excludesXX

XX

plugin.foldersXX

XX

plugin.includesXX

XX

preferred.schema.name





X
publisher.orderX-





rabbitmq.publisher.bindingXX





rabbitmq.publisher.binding.argumentsXX





rabbitmq.publisher.exchange.nameXX





rabbitmq.publisher.exchange.optionsXX





rabbitmq.publisher.headers.staticXX





rabbitmq.publisher.queue.nameXX





rabbitmq.publisher.queue.optionsXX





rabbitmq.publisher.routingkeyXX





rabbitmq.publisher.server.uriXX





restapi.auth



XX

restapi.auth.ssl.keypass



XX

restapi.auth.ssl.storepass



XX

restapi.auth.ssl.storepath



XX

restapi.auth.users



XX

scoring.content.mdXX





scoring.db.mdXX





scoring.depth.maxXX





scoring.filter.orderX-

XX

scoring.orphan.mark.gone.afterXX





scoring.orphan.mark.orphan.afterXX





scoring.parse.mdXX





scoring.similarity.modelXX





scoring.similarity.ngramsXX





scoring.similarity.stopword.fileXX





screenshot.locationXX





segment.dump.dir-X





segment.merger.filter-XX




segment.merger.normalizer-XX




segment.merger.segmentName-XX




segment.merger.slice-XX




segment.proxy.port-X





segment.reader.content.recodeXX(owr.)




selenium.driverXX





selenium.enable.headlessXX





selenium.firefox.allowed.hostsX-





selenium.firefox.binary.timeoutX-





selenium.firefox.enable.flashX-





selenium.firefox.load.imageX-





selenium.firefox.load.stylesheetX-





selenium.grid.binaryXX





selenium.grid.driverXX





selenium.hub.hostXX





selenium.hub.pathXX





selenium.hub.portXX





selenium.hub.protocolXX





sftp.password



-X

sftp.port



-X

sftp.server



-X

sftp.user



-X

sitemap.content.limit



X-

sitemap.parser.timeout



XX

sitemap.redir.maxXX





sitemap.size.maxXX





sitemap.strict.parsingXX





sitemap.url.default.sitemap.xmlXX





sitemap.url.filterXX





sitemap.url.normalizeXX





sitemap.url.overwrite.existingXX





solr.auth



XX

solr.auth.password



-X

solr.auth.username



-X

solr.commit.index



XX

link.delete.gone

X

X

 

 

 

 

 

 

link.ignore.internal.domain

X

X

 

 

 

 

 

 

link.ignore.internal.host

X

X

 

 

 

 

 

 

link.ignore.limit.domain

X

X

 

 

 

 

 

 

link.ignore.limit.page

X

X

 

 

 

 

 

 

link.loops.depth

X

X

 

 

 

 

 

 

link.score.updater.clear.score

X

X

 

 

 

 

 

 

linkdb.url.filters

X

X

 

 

 

X

 

linkdb.url.normalizer

X

 

 

 

 

 

 

linkdb.url.normalizer.scope

X

 

 

 

 

 

 

metatag.description

X

 

 

 

 

 

 

metatag.keywords

X

 

 

 

 

 

 

metatags.names

X

X

(test)

 

 

 

 

 

mime.type.magic

X

X

 

 

X

X

 

 

mime.types.file

X

X

 

 

X

X

 

 

moreIndexingFilter.indexMimeTypeParts

X

X

(test)

 

X

X

(test)

 

moreIndexingFilter.mapMimeTypes

X

X

 

 

 

 

 

 

nutch.conf.uuid

X

 

 

X

 

 

parse.filter.urls

X

X

(owr.)

 

 

 

 

 

parse.job.force

 

 

 

 

X

 

 

parse.job.resume

 

 

 

 

X

 

 

parse.normalize.urls

X

X

(owr.)

 

 

 

 

 

parse.plugin.file

X

X

(test)

 

X

X

(test)

 

parser.caching.forbidden.policy

X

X

 

 

X

X

 

 

parser.character.encoding.default

X

X

 

 

X

X

 

 

parser.fix.embeddedparams

X

 

 

 

 

 

 

parser.html.form.use_action

X

X

(test)

 

X

X

(test)

 

parser.html.impl

X

X

 

 

X

X

 

 

parser.html.outlinks.ignore_tags

X

X

 

 

X

X

 

 

parser.skip.truncated

X

X

 

 

X

X

 

 

parser.timeout

X

X

 

 

X

X

 

 

partition.url.mode

X

X

 

 

X

X

 

 

partition.url.seed

X

X

 

X

 

 

plugin.auto-activation

X

X

 

 

X

X

 

 

plugin.excludes

X

X

 

 

X

X

 

 

plugin.folders

X

X

 

 

X

X

 

 

plugin.includes

X

X

(test)

 

X

X

(test)

 

schema.prefix

 

 

 

 

 

 

X

 

scoring.filter.order

X

X

 

 

X

X

 

 

segment.dump.dir

X

 

 

 

 

 

 

segment.merger.filter

X

X

 

 

 

 

 

segment.merger.normalizer

X

X

 

 

 

 

 

segment.merger.segmentName

X

X

 

 

 

 

 

segment.merger.slice

X

X

 

 

 

 

 

segment.proxy.port

X

 

 

X

 

 

segment.reader.co

X

X

 

 

 

 

 

segment.reader.fe

X

X

 

 

 

 

 

segment.reader.ge

X

X

 

 

 

 

 

segment.reader.pa

X

X

 

 

 

 

 

segment.reader.pd

X

X

 

 

 

 

 

segment.reader.pt

X

X

 

 

 

 

 

sftp.password

 

 

 

 

X

 

 

sftp.port

 

 

 

 

X

 

 

sftp.server

 

 

 

 

X

 

 

sftp.user

 

 

 

 

X

 

 

solr.auth

X

X

 

 

 

 

 

 

solr.auth.password

X

 

 

 

 

 

 

solr.auth.username

X

 

 

 

 

 

 

solr.commit.index

X

X

 

 

X

X

 

 


solr.commit.size



XX

 

 



solr.mapping.file



XX

 

 



solr.
mapping
server.
file
url
X




-X

 



storage.crawl.id



X
 
X

storage.data.store.class



X

 

 

-

storage.schema.host



X
solr.params
X

 

 

 

 

 

 

solr.server.url

X

 

 

X

 

 

storage.crawl.id

 

 

 

 

X

X

 

 

storage.data.store.class

 

 

 

 

X

X

(test)

 

storage.schema.host

 

 

 

 

X

X

 

 

storage.schema.webpage

 

 

 

 

X

X

 

 

subcollection.default.field

X

 

 

 

 

 

 

subcollection.default.fieldname

X

 

 

 

 

 

 

subcollections.config

X

 

 

X

 

 

subcollections.xml

X

 

 

X

 

 

tika.config.file

X

 

 

 

 

 

 



storage.schema.webpage



XX

store.http.headersXX





store.http.requestXX





store.ip.addressXX

XX

subcollection.case.insensitiveXX





subcollection.default.fieldnameXX





subcollection.metadata.source-X





subcollections.config-X

-X

subcollections.xml-X

-X

take.screenshotXX





tika.boilerpipe



XX

tika.boilerpipe.extractor



XX

tika.config.fileXX





tika.extractorXX





tika.extractor.boilerpipe.algorithmXX





tika.extractor.boilerpipe.mime.typesXX





tika.htmlmapper.classnameXX

XX

tika.parse.embeddedXX





tika.uppercase.element.namesXX





urlfilter.automaton.fileXX

 

 



XX
 

 



urlfilter.automaton.rules-X
 


-

 

X

 

 


urlfilter.domain.file

X

X

 

 
XX
 


X
 
X

urlfilter.domain.rules-X
 


-

 

X

 

 



urlfilter.
domainblacklist

urlfilter.domainblacklist.rules

X

 

 

 

 

 

 

domaindenylist.file-X

 

 

 

 

 

 







urlfilter.domaindenylist.rules-X





urlfilter.fast.fileXX





urlfilter.orderXX

 

 



XX
 

 



urlfilter.prefix.fileXX

 

 



XX

 

 


urlfilter.prefix.rules
X
-

 

 

X
 


-
 
X

urlfilter.regex.fileXX

 

 


XX

 

 



urlfilter.regex.rules-X
 


-

 

X

 

 


urlfilter.suffix.fileXX

(test)

 


XX

(test)

 



urlfilter.suffix.rules-X

 



-X

urlfilter.tld.length
 




X
 
X

 



urlmeta.tags
X
XX





urlnormalizer.basic.host.idnX-





urlnormalizer.basic.host.trim-trailing-dotX

 

 

 

 

 

 

-





urlnormalizer.hosts.file

X

 

 

 

 

 

 

-X





urlnormalizer.hosts.rules-X

 

 

 

 

 

 







urlnormalizer.loop.countXX

X
 
X
 


urlnormalizer.orderXX
 


X
 
X

urlnormalizer.
order
protocols.fileXX





urlnormalizer.protocols.rulesX

(test)

X





urlnormalizer.regex.file
 
XX
(test)


X
 
X

urlnormalizer.regex.
file
rules-X

-X

 



urlnormalizer.slashes.file-
 
X

X

 

 

urlnormalizer.regex.rules

X

 

 

X

 







urlnormalizer.slashes.rules-X





warc.exporter.only.successful.responses-X





warc.file.size.max-X





webdriver.chrome.driverX-
 






webgraph.url.filters-XX

 

 

 




X

 


webgraph.url.normalizers

X

 
 
-

 

 

 

X





webgui.auth.users



XX
 


webtable.dump.content

 

 

 

 

X

 

 





-X

webtable.dump.headers

 

 

 

 

X

 

 





-X

webtable.dump.links

 

 

 

 





-X

 

 



webtable.dump.text

 

 

 

 





-X

 

 



webtable.url.regex

 

 

 

 

X

 





-X
 


back to FrontPage