komoot · hbruch · Jan 29, 2018 · Jan 31, 2018
diff --git a/es/config/analysis/hyphenation_patterns.xml b/es/config/analysis/hyphenation_patterns.xml
@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
+<hyphenation-info>
+<hyphen-char value="-"/>
+<hyphen-min before="2" after="2"/>
+<classes>
+@!$%^&amp;*()_-+=~`{[}]:;'|&lt;,.&gt;?/0123456789
+aA
+bB
+cC
+dD
+eE
+fF
+gG
+hH
+iI
+jJ
+kK
+lL
+mM
+nN
+oO
+pP
+qQ
+rR
+sS
+tT
+uU
+vV
+wW
+xX
+yY
+zZ
+��
+��
+��
+��
+�
+</classes>
+
+<exceptions>
+</exceptions>
+
+<!-- 
+Numerals in the following patterns indicate word positions,
+at which a split should occur.
+The dot represents a word start/end marker.
+
+Note: an even digit means "avoid a split", an odd digit means "consider a split"; the value reflects the priority.
+Because of a bug in Lucene, only digits between 1 and 6 should be used.
+(see https://issues.apache.org/jira/browse/LUCENE-8124)
+
+Example:
+For pattern 5str. the hyphenation_decompounder would generate the following tokens:
+Hauptst => Hauptst
+Hauptstr => Hauptstr, Haupt, str
+Hauptstra => Hauptstra
+
+This filter expects to be applied after lowercasing and asciifolding/german_normalization.
+
+-->
+
+<patterns>
+5aue.
+5allee.
+5berg.
+5blick.
+5chaussee.
+5damm.
+5dorf.
+5feld.
+5felde.
+5fleck.
+5flecklein.
+5gasse.
+5garten.
+5gebiet.
+5graben.
+5hain.
+5heide.
+5hoehe.
+5hof.
+5hofe.
+5markt.
+5park.
+5platz.
+5ring.
+5stadt.
+5str.
+5strasse.
+5tal.
+5ufer.
+5wald.
+5weg.
+5werk.
+</patterns>
+
+</hyphenation-info>
diff --git a/es/index_settings.json b/es/index_settings.json
@@ -12,6 +12,7 @@
 					"lowercase",
 					"german_normalization",
 					"asciifolding",
+					"photon_hyphenation_decompounder",
 					"unique"
 				]
 			},
@@ -23,6 +24,7 @@
 				"filter": [
 					"lowercase",
 					"german_normalization",
+					"photon_hyphenation_decompounder",
 					"asciifolding"
 				]
 			},
@@ -34,6 +36,7 @@
 					"word_delimiter",
 					"lowercase",
 					"german_normalization",
+					"photon_hyphenation_decompounder",
 					"asciifolding",
 					"unique"
 				],
@@ -47,6 +50,7 @@
 					"word_delimiter",
 					"lowercase",
 					"german_normalization",
+					"photon_hyphenation_decompounder",
 					"asciifolding",
 					"unique"
 				],
@@ -91,9 +95,14 @@
 				"min": "2",
 				"type": "length"
 			},
-			"preserving_word_delimiter": { 
+			"preserving_word_delimiter": {
 				"type": "word_delimiter",
 				"preserve_original": "true"
+			},
+			"photon_hyphenation_decompounder": {
+				"type" : "photon_hyphenation_decompounder",
+				"hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
+				"min_word_size": 6
 			}
 		}
 	}

diff --git a/es/modules/photon-es/photon-es.jar b/es/modules/photon-es/photon-es.jar
diff --git a/es/modules/photon-es/plugin-descriptor.properties b/es/modules/photon-es/plugin-descriptor.properties
@@ -0,0 +1,9 @@
+description=Photon analysis plugin for elasticsearch.
+version=5.5.0.1-SNAPSHOT
+name=photon-es
+jvm=true
+java.version=1.8
+classname=de.komoot.photon.es.PhotonPlugin
+elasticsearch.version=5.5.0
+java.version=1.8
+isolated=true
diff --git a/photon-es/pom.xml b/photon-es/pom.xml
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<name>Plugin: Photon: Elasticsearch Plugin</name>
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>de.systect.photon</groupId>
+	<artifactId>photon-es</artifactId>
+	<version>5.5.0.1-SNAPSHOT</version>
+	<packaging>jar</packaging>
+	<description>Photon plugin for elasticsearch</description>
+	<inceptionYear>2018</inceptionYear>
+	<licenses>
+		<license>
+			<name>The Apache Software License, Version 2.0</name>
+			<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+			<distribution>repo</distribution>
+		</license>
+	</licenses>
+	<properties>
+		<elasticsearch.version>5.5.0</elasticsearch.version>
+		<elasticsearch.plugin.classname>de.komoot.photon.es.PhotonPlugin</elasticsearch.plugin.classname>
+		<maven.compiler.source>1.8</maven.compiler.source>
+		<maven.compiler.target>1.8</maven.compiler.target>
+	</properties>
+	<build>
+		<plugins>
+			<plugin>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>3.3</version>
+				<configuration>
+					<source>${maven.compiler.source}</source>
+					<target>${maven.compiler.target}</target>
+					<encoding>UTF-8</encoding>
+				</configuration>
+			</plugin>
+			<plugin>
+				<artifactId>maven-surefire-plugin</artifactId>
+				<version>2.19</version>
+				<configuration>
+					<includes>
+						<include>**/*Tests.java</include>
+					</includes>
+				</configuration>
+			</plugin>
+			<plugin>
+				<artifactId>maven-source-plugin</artifactId>
+				<version>2.4</version>
+				<executions>
+					<execution>
+						<id>attach-sources</id>
+						<goals>
+							<goal>jar</goal>
+						</goals>
+					</execution>
+				</executions>
+			</plugin>
+			<plugin>
+				<artifactId>maven-assembly-plugin</artifactId>
+				<version>2.6</version>
+				<configuration>
+					<appendAssemblyId>false</appendAssemblyId>
+					<outputDirectory>${project.build.directory}/releases/</outputDirectory>
+					<descriptors>
+						<descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
+					</descriptors>
+				</configuration>
+				<executions>
+					<execution>
+						<phase>package</phase>
+						<goals>
+							<goal>single</goal>
+						</goals>
+					</execution>
+				</executions>
+			</plugin>
+		</plugins>
+	</build>
+	<dependencies>
+		<dependency>
+			<groupId>org.elasticsearch</groupId>
+			<artifactId>elasticsearch</artifactId>
+			<version>${elasticsearch.version}</version>
+			<scope>provided</scope>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-test-framework</artifactId>
+			<version>6.6.1</version>
+			<scope>test</scope>
+		</dependency>
+	</dependencies>
+</project>
diff --git a/photon-es/src/main/assemblies/plugin.xml b/photon-es/src/main/assemblies/plugin.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<assembly>
+	<id>plugin</id>
+	<formats>
+		<format>zip</format>
+	</formats>
+	<includeBaseDirectory>false</includeBaseDirectory>
+	<files>
+		<file>
+			<source>${basedir}/src/main/plugin-metadata/plugin-descriptor.properties</source>
+			<outputDirectory>/</outputDirectory>
+			<filtered>true</filtered>
+		</file>
+	</files>
+	<dependencySets>
+		<dependencySet>
+			<outputDirectory>/</outputDirectory>
+			<useProjectArtifact>true</useProjectArtifact>
+			<useTransitiveFiltering>true</useTransitiveFiltering>
+		</dependencySet>
+	</dependencySets>
+</assembly>
diff --git a/photon-es/src/main/java/de/komoot/photon/es/PhotonPlugin.java b/photon-es/src/main/java/de/komoot/photon/es/PhotonPlugin.java
@@ -0,0 +1,41 @@
+package de.komoot.photon.es;
+
+import static java.util.Collections.singletonMap;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+
+import de.komoot.photon.es.analysis.HyphenationCompoundWordTokenFilterFactory;
+
+/**
+ * Elasticsearch plugin that exposes Photon's custom analysis components.
+ *
+ * <p>It registers a single token filter under the name
+ * {@code photon_hyphenation_decompounder}, backed by
+ * {@link HyphenationCompoundWordTokenFilterFactory}.
+ */
+public class PhotonPlugin extends Plugin implements AnalysisPlugin {
+
+    /**
+     * Returns the token filters contributed by this plugin.
+     *
+     * @return a single-entry map from the filter name to its provider
+     */
+    @Override
+    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+        return singletonMap("photon_hyphenation_decompounder",
+                        requiresAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
+    }
+
+    /**
+     * Wraps a provider so that it reports that it requires analysis settings.
+     *
+     * <p>The wrapper delegates {@code get} unchanged and only overrides
+     * {@code requiresAnalysisSettings()} to return {@code true}.
+     *
+     * @param provider the provider to delegate component creation to
+     * @param <T> the type of analysis component produced
+     * @return a provider that delegates to {@code provider} and requires analysis settings
+     */
+    private static <T> AnalysisModule.AnalysisProvider<T> requiresAnalysisSettings(AnalysisModule.AnalysisProvider<T> provider) {
+        return new AnalysisModule.AnalysisProvider<T>() {
+            @Override
+            public T get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
+                return provider.get(indexSettings, environment, name, settings);
+            }
+
+            @Override
+            public boolean requiresAnalysisSettings() {
+                return true;
+            }
+        };
+    }
+
+}
diff --git a/...es/src/main/java/de/komoot/photon/es/analysis/AbstractCompoundWordTokenFilterFactory.java b/...es/src/main/java/de/komoot/photon/es/analysis/AbstractCompoundWordTokenFilterFactory.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package de.komoot.photon.es.analysis;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
+
+/**
+ * Base factory for compound-word token filters; it reads the settings that
+ * all concrete subclasses share.
+ *
+ * <p>This copy is patched so that a missing {@code word_list} is accepted:
+ * {@link #wordList} may therefore be {@code null}.
+ */
+public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    /** Value of {@code min_word_size}. */
+    protected final int minWordSize;
+    /** Value of {@code min_subword_size}. */
+    protected final int minSubwordSize;
+    /** Value of {@code max_subword_size}. */
+    protected final int maxSubwordSize;
+    /** Value of {@code only_longest_match}; {@code false} when not configured. */
+    protected final boolean onlyLongestMatch;
+    /** Word set resolved from {@code word_list}; may be {@code null}. */
+    protected final CharArraySet wordList;
+
+    /**
+     * Reads the shared compound-word settings from {@code settings}.
+     *
+     * <p>Numeric settings that are absent fall back to the defaults declared
+     * on {@link CompoundWordTokenFilterBase}.
+     *
+     * @param indexSettings settings of the index the filter belongs to
+     * @param env environment used to resolve the word list
+     * @param name name of the token filter
+     * @param settings settings of this token filter
+     */
+    public AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        this.minWordSize = settings.getAsInt(
+                "min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
+        this.minSubwordSize = settings.getAsInt(
+                "min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+        this.maxSubwordSize = settings.getAsInt(
+                "max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+        this.onlyLongestMatch = settings.getAsBoolean("only_longest_match", false);
+        this.wordList = Analysis.getWordSet(env, settings, "word_list");
+        // PATCH: for the hyphenation_decompounder the word_list may be null, so the
+        // check that rejected a missing word_list with an IllegalArgumentException
+        // is intentionally not performed here.
+    }
+}