-
Notifications
You must be signed in to change notification settings - Fork 10
/
FuzzyMatchingJoinExample.scala
42 lines (29 loc) · 1.19 KB
/
FuzzyMatchingJoinExample.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
package com.pb.fuzzy.matching
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Logger
import org.apache.log4j.Level
import com.pb.fuzzy.matching.functions._
import org.apache.spark.sql.functions._
/**
 * Runnable example: reads two movie CSV datasets and joins them on
 * approximately-matching titles, printing a sample of each stage.
 *
 * The match predicate is built from `levenshteinFn`, which is imported from
 * `com.pb.fuzzy.matching.functions` (presumably an edit-distance column
 * function — its definition is not part of this file).
 */
object FuzzyMatchingJoinExample {

  /** Exclusive upper bound on the `levenshteinFn` result for two titles to be joined. */
  private val MaxTitleDistance = 5

  /** Number of rows printed for each dataset. */
  private val PreviewRows = 5

  /**
   * Silences the "org" and "akka" log4j loggers and returns a local-mode
   * [[SparkSession]], reusing an existing session when one is already active
   * (`getOrCreate`).
   *
   * @param appName application name passed to the session builder
   * @return the created or reused session
   */
  def getSparkSession(appName: String): SparkSession = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)

    SparkSession
      .builder()
      .appName(appName)
      .master("local")
      .getOrCreate()
  }

  /**
   * Reads a CSV file with a header row and schema inference enabled.
   *
   * @param sparkSession session used to perform the read
   * @param path         location of the CSV file
   * @return the loaded rows as a DataFrame
   */
  private def readMovies(
      sparkSession: SparkSession,
      path: String
  ): org.apache.spark.sql.DataFrame =
    sparkSession.read
      .option("header", "true")
      // NOTE(review): Spark's CSV reader documents `sep` (alias `delimiter`) as the
      // delimiter option; "separator" looks unrecognised and is probably ignored,
      // leaving the default ",". Kept unchanged to preserve current behaviour —
      // confirm the real delimiter of the data files before renaming this key.
      .option("separator", ";")
      .option("inferSchema", "true")
      .csv(path)

  /**
   * Entry point: loads both datasets, shows a preview of each, performs the
   * fuzzy join on the `title` columns and shows a preview of the result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkSession = getSparkSession("Fuzzy matching")

    // Always release the session, even when reading or joining fails.
    try {
      val df = readMovies(sparkSession, "src/test/data/movies1.txt")
      val dfWithWrongTitles = readMovies(sparkSession, "src/test/data/movies2.txt")

      println("Dataset with proper names")
      df.show(PreviewRows)

      println("Dataset with misspelled names")
      dfWithWrongTitles.show(PreviewRows)

      // The join condition is an inequality on a computed distance rather than
      // a key equality, so rows are paired whenever their titles are "close".
      val joinedDF = df.join(
        dfWithWrongTitles,
        levenshteinFn(df("title"), dfWithWrongTitles("title")) < MaxTitleDistance
      )

      println("Dataset after fuzzy join")
      joinedDF.show(PreviewRows)
    } finally {
      sparkSession.stop()
    }
  }
}