Skip to content

Commit

Permalink
dd
Browse files Browse the repository at this point in the history
  • Loading branch information
binbin.hou committed Nov 25, 2021
1 parent 175604d commit 1b46839
Show file tree
Hide file tree
Showing 9 changed files with 204 additions and 7 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,10 @@
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:---|:---|:---|:--|
| 1 | A | 常见基础实现 | 2019-02-20 21:40:43 | |


# release_1.1.0

| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:---|:---|:---|:--|
| 1 | A | 添加拆字实现 | 2021-11-25 21:40:43 | |
14 changes: 8 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ maven 3.x+
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>nlp-hanzi-similar</artifactId>
<version>1.0.0</version>
<version>1.1.0</version>
</dependency>
```

Expand Down Expand Up @@ -83,6 +83,7 @@ double rate = HanziSimilarBs.newInstance()
.bushouRate(6)
.bihuashuRate(2)
.pinyinRate(1)
.chaiziRate(8)
.similar('', '');
```

Expand Down Expand Up @@ -127,11 +128,12 @@ HanziSimilarBs 中允许自定义的配置列表如下:
| 10 | sijiaoRate | 四角编码权重 |
| 12 | sijiaoData | 四角编码数据 |
| 13 | sijiaoSimilar | 四角编码相似度策略 |
| 14 | pinyinRate | 拼音权重 |
| 15 | pinyinData | 拼音数据 |
| 16 | pinyinSimilar | 拼音相似度策略 |
| 17 | hanziSimilar | 汉字相似度核心策略 |
| 18 | userDefineData | 用户自定义数据 |
| 14 | pinyinData | 拼音数据 |
| 15 | pinyinSimilar | 拼音相似度策略 |
| 16 | hanziSimilar | 汉字相似度核心策略 |
| 17 | userDefineData | 用户自定义数据 |
| 18 | chaiziRate | 拆字比例 |
| 19 | chaiziSimilar | 拆字相似度 |

所有的配置都可以基于接口,用户进行自定义。

Expand Down
10 changes: 10 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@
<artifactId>pinyin</artifactId>
<version>0.2.2</version>
</dependency>
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>nlp-chaizi</artifactId>
<version>1.1.0</version>
</dependency>

<!--============================== OTHER ==============================-->
<dependency>
Expand Down Expand Up @@ -80,6 +85,11 @@
<artifactId>pinyin</artifactId>
</dependency>

<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>nlp-chaizi</artifactId>
</dependency>

<!--============================== OTHER ==============================-->
<dependency>
<groupId>junit</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,18 @@ public interface IHanziSimilarContext {
*/
double pinyinRate();

/**
* Returns the similarity strategy based on character decomposition (chaizi).
* <p>Note: the method name is spelled {@code chaiziSimiar}; it is part of the
* interface contract, so implementers must use this exact spelling.</p>
* @return the chaizi similarity strategy
*/
IHanziSimilar chaiziSimiar();

/**
* Returns the rate (weight) configured for the chaizi (character decomposition) similarity.
* @return the chaizi rate
*/
double chaiziRate();

/**
* 用户自定义 数据
* @return 数据
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,20 @@ public static HanziSimilarBs newInstance() {
*/
private IHanziSimilar pinyinSimilar = HanziSimilars.pinyin();

/**
* Rate (weight) of the chaizi (character decomposition) similarity.
* Initialised from {@link HanziSimilarRateConst#CHAIZI}.
* @since 1.1.0
*/
private double chaiziRate = HanziSimilarRateConst.CHAIZI;

/**
* Similarity strategy based on character decomposition (chaizi).
* Initialised with the implementation returned by {@link HanziSimilars#chaizi()}.
* @since 1.1.0
*/
private IHanziSimilar chaiziSimilar = HanziSimilars.chaizi();

/**
* Core similarity implementation.
* Initialised with the instance returned by {@code Instances.singleton(HanziSimilar.class)}.
* NOTE(review): confirm the version tag below; the chaizi members are the 1.1.0 additions,
* and this field may predate that release.
* @since 1.1.0
*/
private IHanziSimilar hanziSimilar = Instances.singleton(HanziSimilar.class);

Expand Down Expand Up @@ -183,6 +195,16 @@ public HanziSimilarBs pinyinSimilar(IHanziSimilar pinyinSimilar) {
return this;
}

/**
* Sets the rate (weight) of the chaizi (character decomposition) similarity.
* @param chaiziRate the rate to use
* @return this instance, to allow call chaining
* @since 1.1.0
*/
public HanziSimilarBs chaiziRate(double chaiziRate) {
this.chaiziRate = chaiziRate;
return this;
}

/**
* Sets the similarity strategy used for character decomposition (chaizi).
* @param chaiziSimilar the strategy to use
* @return this instance, to allow call chaining
* @since 1.1.0
*/
public HanziSimilarBs chaiziSimilar(IHanziSimilar chaiziSimilar) {
this.chaiziSimilar = chaiziSimilar;
return this;
}

public HanziSimilarBs hanziSimilar(IHanziSimilar hanziSimilar) {
this.hanziSimilar = hanziSimilar;
return this;
Expand Down Expand Up @@ -222,7 +244,9 @@ private IHanziSimilarContext buildContext(char one, char two) {
.sijiaoRate(sijiaoRate)
.sijiaoSimilar(sijiaoSimilar)
.pinyinRate(pinyinRate)
.pinyinSimilar(pinyinSimilar);
.pinyinSimilar(pinyinSimilar)
.chaiziRate(chaiziRate)
.chaiziSimiar(chaiziSimilar);

return context;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ private HanziSimilarRateConst(){}
*/
public static final double SIJIAO = 8.0;

/**
* Rate (weight) constant for the chaizi (character decomposition) similarity.
*/
public static final double CHAIZI = 6.0;

/**
* 偏旁部首
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package com.github.houbb.nlp.hanzi.similar.support.similar;

import com.github.houbb.heaven.util.lang.CharUtil;
import com.github.houbb.heaven.util.util.ArrayPrimitiveUtil;
import com.github.houbb.heaven.util.util.ArrayUtil;
import com.github.houbb.nlp.hanzi.similar.api.IHanziSimilar;
import com.github.houbb.nlp.hanzi.similar.api.IHanziSimilarContext;
import com.github.houbb.nlp.hanzi.similar.util.ChaiziHelper;

import java.util.List;
import java.util.Map;

/**
* 拆字
*
* A = {A1, A2, ..., Am}
* B = {B1, B2, ..., Bm}
*
* 每一个组成部分都有对应的笔画数(没有默认取1),所以有对应的权重。
*
* 得分应该如何计算呢?
*
* 长度:min(A, B) = m_AB
* 然后遍历,遍历元素。比如以 A 为准。
*
* A1 和 B1 相同,score_1 = A1_n/A_n + B1_n/B_n;
*
* 如何归一化?
*
* @author binbin.hou
* @since 1.0.0
*/
/**
 * Similarity based on character decomposition ("chaizi").
 *
 * <p>Both characters are split into their component characters:</p>
 * <pre>
 * A = {A1, A2, ..., Am}
 * B = {B1, B2, ..., Bn}
 * </pre>
 *
 * <p>Every component is weighted by its stroke count (1 when the stroke count is unknown).
 * For each component of the shorter decomposition that also occurs in the longer one,
 * the contribution is the mean of {@code strokes(component) / strokes(A)} and
 * {@code strokes(component) / strokes(B)}. The contributions are summed and the sum is
 * multiplied by {@link IHanziSimilarContext#chaiziRate()}.</p>
 *
 * <p>No further normalisation is applied to the sum.</p>
 *
 * @author binbin.hou
 * @since 1.1.0
 */
public class ChaiziSimilar implements IHanziSimilar {

    /**
     * Computes the weighted chaizi similarity of the two characters held by the context.
     *
     * @param similarContext context providing both characters, the stroke-count data
     *                       and the chaizi rate
     * @return summed component score multiplied by the chaizi rate
     */
    @Override
    public double similar(IHanziSimilarContext similarContext) {
        String hanziOne = similarContext.charOne();
        String hanziTwo = similarContext.charTwo();

        int numberOne = getNumber(hanziOne, similarContext);
        int numberTwo = getNumber(hanziTwo, similarContext);

        // Decompose both characters into their components.
        char[] charsOne = getSplitChars(hanziOne);
        char[] charsTwo = getSplitChars(hanziTwo);

        // Walk the shorter decomposition and look every component up in the longer one.
        // Walking only a prefix of the first character's components would silently skip
        // shared components whenever the first character has the longer decomposition,
        // making the score depend on argument order.
        char[] probe = charsOne;
        char[] target = charsTwo;
        if (charsOne.length > charsTwo.length) {
            probe = charsTwo;
            target = charsOne;
        }

        double totalScore = 0.0;
        for (char component : probe) {
            if (!ArrayPrimitiveUtil.contains(target, component)) {
                continue;
            }

            int componentNumber = getNumber(String.valueOf(component), similarContext);

            // Share of the component's strokes within each complete character.
            double scoreOne = (double) componentNumber / numberOne;
            double scoreTwo = (double) componentNumber / numberTwo;

            totalScore += (scoreOne + scoreTwo) / 2.0;
        }

        return totalScore * similarContext.chaiziRate();
    }

    /**
     * Returns the component characters of the given character.
     *
     * <p>Only the first decomposition candidate is used; choosing the best candidate is
     * left open for simplicity. When no candidate is available the character is treated
     * as its own single component, so identical characters still match.</p>
     *
     * @param charWord single-character string
     * @return component characters; empty when {@code charWord} is null or empty
     */
    private char[] getSplitChars(String charWord) {
        if (charWord == null || charWord.isEmpty()) {
            return new char[0];
        }

        char source = charWord.charAt(0);
        List<String> stringList = ChaiziHelper.chai(source);

        // Guard against a missing decomposition instead of failing on get(0).
        if (stringList == null || stringList.isEmpty() || stringList.get(0) == null) {
            return new char[]{source};
        }

        return stringList.get(0).toCharArray();
    }

    /**
     * Looks up the stroke count of the given text.
     *
     * @param text           text to look up
     * @param similarContext context providing the stroke-count data
     * @return the stroke count, or 1 when it is unknown or not positive
     */
    private int getNumber(String text, IHanziSimilarContext similarContext) {
        Map<String, Integer> map = similarContext.bihuashuData().dataMap();

        Integer number = map.get(text);
        // A non-positive count would turn the divisions in similar() into Infinity/NaN,
        // so it is treated like a missing entry.
        if (number == null || number <= 0) {
            return 1;
        }

        return number;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@ public class HanziSimilarContext implements IHanziSimilarContext {
*/
private IHanziSimilar pinyinSimilar;

/**
* Similarity strategy based on character decomposition (chaizi).
*/
private IHanziSimilar chaiziSimiar;

/**
* Rate (weight) of the chaizi (character decomposition) similarity.
*/
private double chaiziRate;

@Override
public String charOne() {
return charOne;
Expand Down Expand Up @@ -264,4 +274,24 @@ public HanziSimilarContext pinyinSimilar(IHanziSimilar pinyinSimilar) {
this.pinyinSimilar = pinyinSimilar;
return this;
}

/**
* Returns the stored chaizi (character decomposition) similarity strategy.
* @return the chaizi similarity strategy
*/
@Override
public IHanziSimilar chaiziSimiar() {
return chaiziSimiar;
}

/**
* Sets the chaizi (character decomposition) similarity strategy.
* @param chaiziSimiar the strategy to store
* @return this context, to allow call chaining
*/
public HanziSimilarContext chaiziSimiar(IHanziSimilar chaiziSimiar) {
this.chaiziSimiar = chaiziSimiar;
return this;
}

/**
* Returns the stored rate (weight) of the chaizi similarity.
* @return the chaizi rate
*/
@Override
public double chaiziRate() {
return chaiziRate;
}

/**
* Sets the rate (weight) of the chaizi similarity.
* @param chaiziRate the rate to store
* @return this context, to allow call chaining
*/
public HanziSimilarContext chaiziRate(double chaiziRate) {
this.chaiziRate = chaiziRate;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,13 @@ public static IHanziSimilar sijiao() {
return Instances.singleton(SijiaoSimilar.class);
}

/**
* Returns the similarity implementation based on character decomposition (chaizi).
* @return the {@code ChaiziSimilar} instance obtained from {@code Instances.singleton}
* @since 1.1.0
*/
public static IHanziSimilar chaizi() {
return Instances.singleton(ChaiziSimilar.class);
}

}

0 comments on commit 1b46839

Please sign in to comment.