返回目录
题目描述
给定一个连续不包含空格的字符串,该字符串仅包含英文小写字母及英文标点符号(逗号、分号、句号),同时给定词库,对该字符串进行精确分词。
说明:
- 精确分词:字符串分词后,不会出现重叠。即"ilovechina",不同词库可分割为"i,love,china","ilove,china",不能分割出现重叠的"i,ilove,china",i 出现重叠
- 标点符号不成词,仅用于断句
- 词库:根据外部知识库统计出来的常用词汇例:dictionary = ["i", "love", "china", "lovechina", "ilove"]
- 分词原则:采用分词顺序优先且最长匹配原则
"ilovechina",假设分词结果 [i,ilove,lo,love,ch,china,lovechina],则输出 [ilove,china]
错误输出:[i,lovechina],原因:"ilove" > 优先于 "lovechina" 成词
错误输出:[i,love,china],原因:"ilove" > "i"遵循最长匹配原则
输入描述
第一行输入待分词语句 "ilovechina"
- 字符串长度限制:0 < length < 256
第二行输入中文词库 "i,love,china,ch,na,ve,lo,this,is,this,word"
- 词库长度限制:1 < length < 100000
输出描述
按顺序输出分词结果 "i,love,china"
示例:
输入 | ilovechina i,love,china,ch,na,ve,lo,this,is,the,word |
---|---|
输出 | i,love,china |
说明 | 无 |
输入 | lat i,love,china,ch,na,ve,lo,this,is,the,word,beauti,tiful,ful |
---|---|
输出 | i.a.t |
说明 | 单个字母, 不在词库中且不成词则输出单个字母 |
Python算法源码
class ListNode:
def __init__(self, val):
self.val = val # 节点的值
self.prev = None # 指向前一个节点的引用
self.next = None # 指向下一个节点的引用
class LinkedList:
def __init__(self):
self.size = 0 # 链表的大小
self.head = None # 指向第一个节点的引用
self.tail = None # 指向最后一个节点的引用
def add_first(self, val):
node = ListNode(val)
if self.size == 0:
self.head = node
self.tail = node
else:
node.next = self.head
self.head.prev = node
self.head = node
self.size += 1
def add_last(self, val):
node = ListNode(val)
if self.size == 0:
self.head = node
self.tail = node
else:
self.tail.next = node
node.prev = self.tail
self.tail = node
self.size += 1
def remove_first(self):
if self.size == 0:
exit(-1) # 如果链表为空则退出并显示错误
removed = self.head
if self.size == 1:
self.head = None
self.tail = None
else:
self.head = self.head.next
self.head.prev = None
self.size -= 1
return removed.val
def remove_last(self):
if self.size == 0:
exit(-1) # 如果链表为空则退出并显示错误
removed = self.tail
if self.size == 1:
self.head = None
self.tail = None
else:
self.tail = self.tail.prev
self.tail.next = None
self.size -= 1
return removed.val
def substr(s, l, r):
return s[l:r] # 函数用于获取字符串s从索引l到r的子串
MAX_LENGTH = 256 # 字符串的最大长度
MAX_WORDS_LENGTH = 100000 # 字典中单词的最大数量
delimiter = ",.;" # 用于拆分输入字符串的分隔符
def main():
s1 = input().strip() # 待分词的输入字符串
# sentences 记录待分词的句子
sentences = LinkedList()
tokens1 = s1.split(delimiter)
for token in tokens1:
sentences.add_last(token)
s2 = input().strip() # 包含词典中单词的输入字符串
# words 记录词典中的单词
words = LinkedList()
tokens2 = s2.split(delimiter)
for token in tokens2:
words.add_last(token)
# ans 记录最终分词结果
ans = LinkedList()
while sentences.size > 0:
sentence = sentences.remove_first()
length = len(sentence)
r = length
while r > 0:
# 从句子中提取子串[0,r)
fragment = substr(sentence, 0, r)
find = False
cur = words.head
while cur is not None:
# 如果子串在词典中存在
if cur.val == fragment:
# 将子串添加到结果中
ans.add_last(fragment)
cur.val = "" # 从词典中移除已使用的单词
# 如果子串只是句子的一部分,则继续在词典中搜索剩余部分
if r < length:
sentences.add_first(substr(sentence, r, length))
find = True
break
cur = cur.next
if find:
break
r -= 1
# 如果子串在词典中未找到,则输出单个字母
if r == 0:
# 将单个字母添加到结果中,并继续在词典中搜索剩余部分
tmp = sentence[0]
ans.add_last(tmp)
if length > 1:
sentences.add_first(substr(sentence, 1, length))
cur = ans.head
while cur is not None:
print(cur.val, end="")
cur = cur.next
if cur is not None:
print(",", end="")
if __name__ == "__main__":
main()
C算法源码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_SENTENCE_LEN 1000
#define MAX_WORD_LEN 100
// 分割字符串
char** splitString(char* str, char* delimiters, int* count) {
char** result = (char**)malloc(sizeof(char*) * MAX_SENTENCE_LEN);
char* token = strtok(str, delimiters);
*count = 0;
while (token != NULL) {
result[(*count)++] = token;
token = strtok(NULL, delimiters);
}
return result;
}
// 检查字符串是否在词库中
int isInWordSet(char* word, char** wordSet, int wordSetSize) {
for (int i = 0; i < wordSetSize; ++i) {
if (strcmp(word, wordSet[i]) == 0)
return 1;
}
return 0;
}
// 获取结果
char* getResult(char** sentences, int sentenceCount, char** words, int wordCount) {
// 创建词库副本
char** wordSet = (char**)malloc(sizeof(char*) * wordCount);
for (int i = 0; i < wordCount; ++i)
wordSet[i] = words[i];
// 创建结果字符串
char* ans = (char*)malloc(sizeof(char) * MAX_SENTENCE_LEN);
ans[0] = '\0';
while (sentenceCount > 0) {
char* sentence = sentences[--sentenceCount];
int len = strlen(sentence);
while (len > 0) {
// 提取句子的前缀作为可能的词
char* fragment = (char*)malloc(sizeof(char) * (MAX_WORD_LEN + 1));
strncpy(fragment, sentence, len);
fragment[len] = '\0';
// 如果该词在词库中,则加入结果中
if (isInWordSet(fragment, wordSet, wordCount)) {
strcat(ans, fragment);
strcat(ans, ",");
// 从词库中移除该词
for (int i = 0; i < wordCount; ++i) {
if (strcmp(wordSet[i], fragment) == 0) {
free(wordSet[i]);
wordSet[i] = wordSet[--wordCount];
break;
}
}
// 将剩余部分重新加入待处理句子中
if (len < strlen(sentence)) {
char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - len));
strcpy(remaining, sentence + len);
sentences[sentenceCount++] = remaining;
}
free(fragment);
break;
}
free(fragment);
--len;
}
// 如果句子无法继续切分,则处理为单个字符
if (len == 0) {
char* singleChar = (char*)malloc(sizeof(char) * 2);
strncpy(singleChar, sentence, 1);
singleChar[1] = '\0';
strcat(ans, singleChar);
strcat(ans, ",");
// 将剩余部分重新加入待处理句子中
if (strlen(sentence) > 1) {
char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - 1));
strcpy(remaining, sentence + 1);
sentences[sentenceCount++] = remaining;
}
free(singleChar);
}
}
free(wordSet);
return ans;
}
int main() {
char inputSentence[MAX_SENTENCE_LEN];
char inputWords[MAX_SENTENCE_LEN];
// 输入句子和词库
printf("请输入句子: ");
fgets(inputSentence, MAX_SENTENCE_LEN, stdin);
inputSentence[strcspn(inputSentence, "\n")] = 0; // 移除换行符
printf("请输入词库: ");
fgets(inputWords, MAX_SENTENCE_LEN, stdin);
inputWords[strcspn(inputWords, "\n")] = 0; // 移除换行符
int sentenceCount, wordCount;
char** sentences = splitString(inputSentence, ",.;", &sentenceCount);
char** words = splitString(inputWords, ",.;", &wordCount);
// 获取结果并打印
char* result = getResult(sentences, sentenceCount, words, wordCount);
printf("%s\n", result);
// 释放内存
for (int i = 0; i < sentenceCount; ++i)
free(sentences[i]);
free(sentences);
for (int i = 0; i < wordCount; ++i)
free(words[i]);
free(words);
free(result);
return 0;
}
Java算法源码
import java.util.*;
public class Main {
private static final int MAX_LENGTH = 256;
private static final int MAX_WORDS_LENGTH = 100000;
private static final String delimiter = ",.;";
private static final int TRUE = 1;
private static final int FALSE = 0;
/** 实现双向链表 **/
static class ListNode {
String val;
ListNode prev;
ListNode next;
ListNode(String val) {
this.val = val;
this.prev = null;
this.next = null;
}
}
static class LinkedList {
int size;
ListNode head;
ListNode tail;
LinkedList() {
this.size = 0;
this.head = null;
this.tail = null;
}
}
// 创建新的链表节点
static ListNode newListNode(String val) {
ListNode node = new ListNode(val);
node.val = val;
node.prev = null;
node.next = null;
return node;
}
// 创建新的链表
static LinkedList newLinkedList() {
LinkedList link = new LinkedList();
link.size = 0;
link.head = null;
link.tail = null;
return link;
}
// 在链表头部添加节点
static void addFirstLinkedList(LinkedList link, String val) {
ListNode node = newListNode(val);
if (link.size == 0) {
link.head = node;
link.tail = node;
} else {
node.next = link.head;
link.head.prev = node;
link.head = node;
}
link.size++;
}
// 在链表尾部添加节点
static void addLastLinkedList(LinkedList link, String val) {
ListNode node = newListNode(val);
if (link.size == 0) {
link.head = node;
link.tail = node;
} else {
link.tail.next = node;
node.prev = link.tail;
link.tail = node;
}
link.size++;
}
// 移除链表头部节点
static String removeFirstLinkedList(LinkedList link) {
if (link.size == 0) System.exit(-1);
ListNode removed = link.head;
if (link.size == 1) {
link.head = null;
link.tail = null;
} else {
link.head = link.head.next;
link.head.prev = null;
}
link.size--;
String res = removed.val;
return res;
}
// 移除链表尾部节点
static String removeLastLinkedList(LinkedList link) {
if (link.size == 0) System.exit(-1);
ListNode removed = link.tail;
if (link.size == 1) {
link.head = null;
link.tail = null;
} else {
link.tail = link.tail.prev;
link.tail.next = null;
}
link.size--;
String res = removed.val;
return res;
}
/** 实现字符串截取 **/
static String substr(String s, int l, int r) {
int len = r - l;
return s.substring(l, r);
}
public static void main(String[] args) {
Scanner scanner = new Scanner(System.in);
String s1 = scanner.nextLine();
// sentences记录待分词语句
LinkedList sentences = newLinkedList();
StringTokenizer st1 = new StringTokenizer(s1, delimiter);
while (st1.hasMoreTokens()) {
addLastLinkedList(sentences, st1.nextToken());
}
String s2 = scanner.nextLine();
// words 记录词库词汇
LinkedList words = newLinkedList();
StringTokenizer st2 = new StringTokenizer(s2, delimiter);
while (st2.hasMoreTokens()) {
addLastLinkedList(words, st2.nextToken());
}
// ans记录最终分词结果
LinkedList ans = newLinkedList();
while (sentences.size > 0) {
String sentence = removeFirstLinkedList(sentences);
int len = sentence.length();
int r = len;
for (; r > 0; r--) {
// 截取句子 [0,r) 范围子串词汇, 这样的就能实现优先最长匹配,并且由于是必须从0索引开始截取,因此满足了分词顺序优先
String fragment = substr(sentence, 0, r);
boolean find = false;
ListNode cur = words.head;
while (cur != null) {
// 若词库中是否存在该子串词汇
if (cur.val.equals(fragment)) {
// 则将对应子串词汇纳入结果
addLastLinkedList(ans, fragment);
cur.val = ""; // 我理解词库中每个词汇只能使用一次,因此这里将词库中使用过的词汇移除
// 若子串词汇只是句子部分,则句子剩余部分还要继续去词库中查找
if (r < len) {
addFirstLinkedList(sentences, substr(sentence, r, len));
}
find = true;
break;
}
cur = cur.next;
}
if (find) {
break;
}
}
// 没有在词库中找到对应子串词汇,则输出单个字母
if (r == 0) {
// 注意,这里只放一个字母到结果中,句子剩余部分继续去词库中查找
String tmp = sentence.substring(0, 1);
addLastLinkedList(ans, tmp);
if (len > 1) {
addFirstLinkedList(sentences, substr(sentence, 1, len));
}
}
}
ListNode cur = ans.head;
while (cur != null) {
System.out.print(cur.val);
cur = cur.next;
if (cur != null) {
System.out.print(",");
}
}
}
}
3 条评论
[...]2024 C卷 100分序号题目知 识 点难易程度1密码输入检测数据结构/栈☆☆☆2分配土地几何问题☆☆☆3找座位逻辑分析☆☆☆4智能成绩表动态条件分析☆☆☆5内存冷热标记多条件排序☆☆☆6螺旋数字矩阵逻辑分析☆☆☆7围棋的气逻辑分析☆☆☆8分割平衡字符串逻辑分析☆☆☆9机器人搬砖二分法☆☆☆10转盘寿司数据结构/栈/单调栈☆☆☆11小明找位置二分法☆☆☆12提取字符串的最长合法简单数学表达式双指[...]
[...]2024 C卷 100分序号题目知 识 点难易程度1密码输入检测数据结构/栈☆☆☆2分配土地几何问题☆☆☆3找座位逻辑分析☆☆☆4智能成绩表动态条件分析☆☆☆5内存冷热标记多条件排序☆☆☆6螺旋数字矩阵逻辑分析☆☆☆7围棋的气逻辑分析☆☆☆8分割平衡字符串逻辑分析☆☆☆9机器人搬砖二分法☆☆☆10转盘寿司数据结构/栈/单调栈☆☆☆11小明找位置二分法☆☆☆12提取字符串的最长合法简单数学表达式双指[...]
[...]2024 C卷 100分序号题目知 识 点难易程度1密码输入检测数据结构/栈☆☆☆2分配土地几何问题☆☆☆3找座位逻辑分析☆☆☆4智能成绩表动态条件分析☆☆☆5内存冷热标记多条件排序☆☆☆6螺旋数字矩阵逻辑分析☆☆☆7围棋的气逻辑分析☆☆☆8分割平衡字符串逻辑分析☆☆☆9机器人搬砖二分法☆☆☆10转盘寿司数据结构/栈/单调栈☆☆☆11小明找位置二分法☆☆☆12提取字符串的最长合法简单数学表达式双指[...]