This Confluence has been LDAP enabled, if you are an ASF Committer, please use your LDAP Credentials to login. Any problems file an INFRA jira ticket please.

Page tree
Skip to end of metadata
Go to start of metadata

Integrating Tika With ExtractingRequestHandler

0. The online Apache Maven repositories do not currently publish Tika 1.8-SNAPSHOT, so Tika has to be built locally and resolved from the local Maven repository.

1. Check out tika-trunk:

$ svn co https://svn.apache.org/repos/asf/tika/trunk tika-trunk

2. Build Tika

$ cd tika-trunk

$ mvn install

3. Download tika-parsers dependencies:

$ cd tika-parsers

$ mvn dependency:copy-dependencies

4. Check out the lucene_solr_4_10 branch:

$ svn checkout https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_4_10 lucene_solr_4_10

5. Modify lucene_solr_4_10/lucene/ivy-settings.xml by uncommenting lines 45-52 and line 56, so that Ivy also resolves artifacts from the local Maven repository:

Line 45-52:

    <filesystem name="local-maven-2" m2compatible="true" local="true">
      <artifact
          pattern="${local-maven2-dir}/[organisation]/[module]/[revision]/[module]-[revision].[ext]" />
      <ivy
          pattern="${local-maven2-dir}/[organisation]/[module]/[revision]/[module]-[revision].pom" />
    </filesystem>


Line 56:

   <resolver ref="local-maven-2" />

6. Replace the contents of lucene_solr_4_10/solr/contrib/extraction/ivy.xml with the following ivy.xml:


<!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.
-->


<ivy-module version="2.0">
  <info organisation="org.apache.solr" module="extraction"/>

  <!-- Two configurations: "compile" for the jars the extraction contrib is built
       against, "test" for test-only jars. Both are non-transitive, so every jar
       that is needed must be listed explicitly under <dependencies>. -->
  <configurations defaultconfmapping="compile->master;test->master">
    <conf name="compile" transitive="false"/>
    <conf name="test" transitive="false"/>
  </configurations>

  <dependencies>
    <!-- Tika JARs: pinned to the 1.8-SNAPSHOT build installed by "mvn install"
         in the earlier steps of this guide. -->
    <dependency org="org.apache.tika" name="tika-core" rev="1.8-SNAPSHOT" conf="compile"/>
    <dependency org="org.apache.tika" name="tika-parsers" rev="1.8-SNAPSHOT" conf="compile"/>
    <dependency org="org.apache.tika" name="tika-xmp" rev="1.8-SNAPSHOT" conf="compile"/>

    <!-- Tika dependencies - see the Tika "Getting Started" guide
         (section "Using Tika as a Maven dependency") -->
    <!-- When upgrading Tika, upgrade dependencies versions and add any new ones
         (except slf4j-api, commons-codec, commons-logging, commons-httpclient, geronimo-stax-api_1.0_spec, jcip-annotations, xml-apis, asm)
         WARNING: Don't add netcdf / unidataCommon (partially LGPL code) -->
    <dependency org="org.gagravarr" name="vorbis-java-tika" rev="0.6" conf="compile"/>
    <dependency org="org.gagravarr" name="vorbis-java-core" rev="0.6" conf="compile"/>
    <dependency org="org.apache.james" name="apache-mime4j-core" rev="0.7.2" conf="compile"/>
    <dependency org="org.apache.james" name="apache-mime4j-dom" rev="0.7.2" conf="compile"/>
    <dependency org="org.apache.commons" name="commons-compress" rev="1.9" conf="compile"/>
    <dependency org="org.apache.pdfbox" name="pdfbox" rev="1.8.8" conf="compile"/>
    <dependency org="org.apache.pdfbox" name="fontbox" rev="1.8.8" conf="compile"/>
    <dependency org="org.apache.pdfbox" name="jempbox" rev="1.8.8" conf="compile"/>
    <dependency org="org.bouncycastle" name="bcmail-jdk15" rev="1.45" conf="compile"/>
    <dependency org="org.bouncycastle" name="bcprov-jdk15" rev="1.45" conf="compile"/>
    <dependency org="org.apache.poi" name="poi" rev="3.11" conf="compile"/>
    <dependency org="org.apache.poi" name="poi-scratchpad" rev="3.11" conf="compile"/>
    <dependency org="org.apache.poi" name="poi-ooxml" rev="3.11" conf="compile"/>
    <dependency org="org.apache.poi" name="poi-ooxml-schemas" rev="3.11" conf="compile"/>
    <dependency org="org.apache.xmlbeans" name="xmlbeans" rev="2.6.0" conf="compile"/>
    <!-- ${/org/name} revisions are property references; they are expected to be
         supplied by the Lucene/Solr build's shared version properties. -->
    <dependency org="dom4j" name="dom4j" rev="${/dom4j/dom4j}" conf="compile"/>
    <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1" conf="compile"/>
    <dependency org="com.googlecode.mp4parser" name="isoparser" rev="1.0.2" conf="compile"/>
    <dependency org="org.aspectj" name="aspectjrt" rev="1.8.0" conf="compile"/>
    <dependency org="com.drewnoakes" name="metadata-extractor" rev="2.6.2" conf="compile"/>
    <dependency org="de.l3s.boilerpipe" name="boilerpipe" rev="1.1.0" conf="compile"/>
    <dependency org="rome" name="rome" rev="1.0" conf="compile"/>
    <dependency org="jdom" name="jdom" rev="1.0" conf="compile"/>
    <dependency org="com.googlecode.juniversalchardet" name="juniversalchardet" rev="1.0.3" conf="compile"/>
    <dependency org="org.tukaani" name="xz" rev="1.5" conf="compile"/>
    <dependency org="com.adobe.xmp" name="xmpcore" rev="5.1.2" conf="compile"/>
    <dependency org="com.uwyn" name="jhighlight" rev="1.0" conf="compile"/>

    <!-- Other ExtractingRequestHandler dependencies -->
    <dependency org="com.ibm.icu" name="icu4j" rev="${/com.ibm.icu/icu4j}" conf="compile"/>
    <dependency org="xerces" name="xercesImpl" rev="${/xerces/xercesImpl}" conf="compile"/>

    <dependency org="org.slf4j" name="jcl-over-slf4j" rev="${/org.slf4j/jcl-over-slf4j}" conf="test"/>

    <!-- Skip artifact types matched by the build-wide ivy.exclude.types pattern. -->
    <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
  </dependencies>
</ivy-module>



7. Compile Solr

Change the working directory to lucene_solr_4_10/solr/

$ ant compile

8. Generate new sha1 files for the jars

$ ant jar-checksums

9. Done. Enjoy.

  • No labels