中文分词模拟器

博主： od
发布时间：2024 年 04 月 01 日
356 次浏览
3 条评论
14597字数
分类：华为od机试真题

返回目录

题目描述

给定一个连续不包含空格的字符串，该字符串仅包含英文小写字母及英文标点符号（逗号、分号、句号），同时给定词库，对该字符串进行精确分词。

说明：

精确分词：字符串分词后，不会出现重叠。即"ilovechina"，不同词库可分割为"i,love,china"，"ilove,china"，不能分割出现重叠的"i,ilove,china"，i 出现重叠
标点符号不成词，仅用于断句
词库：根据外部知识库统计出来的常用词汇例：dictionary = ["i", "love", "china", "lovechina", "ilove"]
分词原则：采用分词顺序优先且最长匹配原则

"ilovechina"，假设分词结果 [i,ilove,lo,love,ch,china,lovechina]，则输出 [ilove,china]

错误输出：[i,lovechina]，原因："ilove" > 优先于 "lovechina" 成词

错误输出：[i,love,china]，原因："ilove" > "i"遵循最长匹配原则

输入描述

第一行输入待分词语句 "ilovechina"

字符串长度限制：0 < length < 256

第二行输入中文词库 "i,love,china,ch,na,ve,lo,this,is,this,word"

词库长度限制：1 < length < 100000

输出描述

按顺序输出分词结果 "i,love,china"

示例：

输入	ilovechina i,love,china,ch,na,ve,lo,this,is,the,word
输出	i,love,china
说明	无

输入	lat i,love,china,ch,na,ve,lo,this,is,the,word,beauti,tiful,ful
输出	i.a.t
说明	单个字母，不在词库中且不成词则输出单个字母

Python算法源码


class ListNode:
    def __init__(self, val):
        self.val = val  # 节点的值
        self.prev = None  # 指向前一个节点的引用
        self.next = None  # 指向下一个节点的引用

class LinkedList:
    def __init__(self):
        self.size = 0  # 链表的大小
        self.head = None  # 指向第一个节点的引用
        self.tail = None  # 指向最后一个节点的引用

    def add_first(self, val):
        node = ListNode(val)
        if self.size == 0:
            self.head = node
            self.tail = node
        else:
            node.next = self.head
            self.head.prev = node
            self.head = node
        self.size += 1

    def add_last(self, val):
        node = ListNode(val)
        if self.size == 0:
            self.head = node
            self.tail = node
        else:
            self.tail.next = node
            node.prev = self.tail
            self.tail = node
        self.size += 1

    def remove_first(self):
        if self.size == 0:
            exit(-1)  # 如果链表为空则退出并显示错误
        removed = self.head
        if self.size == 1:
            self.head = None
            self.tail = None
        else:
            self.head = self.head.next
            self.head.prev = None
        self.size -= 1
        return removed.val

    def remove_last(self):
        if self.size == 0:
            exit(-1)  # 如果链表为空则退出并显示错误
        removed = self.tail
        if self.size == 1:
            self.head = None
            self.tail = None
        else:
            self.tail = self.tail.prev
            self.tail.next = None
        self.size -= 1
        return removed.val

def substr(s, l, r):
    return s[l:r]  # 函数用于获取字符串s从索引l到r的子串

MAX_LENGTH = 256  # 字符串的最大长度
MAX_WORDS_LENGTH = 100000  # 字典中单词的最大数量
delimiter = ",.;"  # 用于拆分输入字符串的分隔符

def main():
    s1 = input().strip()  # 待分词的输入字符串

    # sentences 记录待分词的句子
    sentences = LinkedList()
    tokens1 = s1.split(delimiter)
    for token in tokens1:
        sentences.add_last(token)

    s2 = input().strip()  # 包含词典中单词的输入字符串

    # words 记录词典中的单词
    words = LinkedList()
    tokens2 = s2.split(delimiter)
    for token in tokens2:
        words.add_last(token)

    # ans 记录最终分词结果
    ans = LinkedList()

    while sentences.size > 0:
        sentence = sentences.remove_first()
        length = len(sentence)
        r = length

        while r > 0:
            # 从句子中提取子串[0,r)
            fragment = substr(sentence, 0, r)

            find = False

            cur = words.head
            while cur is not None:
                # 如果子串在词典中存在
                if cur.val == fragment:
                    # 将子串添加到结果中
                    ans.add_last(fragment)
                    cur.val = ""  # 从词典中移除已使用的单词

                    # 如果子串只是句子的一部分，则继续在词典中搜索剩余部分
                    if r < length:
                        sentences.add_first(substr(sentence, r, length))

                    find = True
                    break

                cur = cur.next

            if find:
                break

            r -= 1

        # 如果子串在词典中未找到，则输出单个字母
        if r == 0:
            # 将单个字母添加到结果中，并继续在词典中搜索剩余部分
            tmp = sentence[0]
            ans.add_last(tmp)
            if length > 1:
                sentences.add_first(substr(sentence, 1, length))

    cur = ans.head
    while cur is not None:
        print(cur.val, end="")
        cur = cur.next
        if cur is not None:
            print(",", end="")

if __name__ == "__main__":
    main()

C算法源码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_SENTENCE_LEN 1000
#define MAX_WORD_LEN 100

// 分割字符串
char** splitString(char* str, char* delimiters, int* count) {
    char** result = (char**)malloc(sizeof(char*) * MAX_SENTENCE_LEN);
    char* token = strtok(str, delimiters);
    *count = 0;
  
    while (token != NULL) {
        result[(*count)++] = token;
        token = strtok(NULL, delimiters);
    }
  
    return result;
}

// 检查字符串是否在词库中
int isInWordSet(char* word, char** wordSet, int wordSetSize) {
    for (int i = 0; i < wordSetSize; ++i) {
        if (strcmp(word, wordSet[i]) == 0)
            return 1;
    }
    return 0;
}

// 获取结果
char* getResult(char** sentences, int sentenceCount, char** words, int wordCount) {
    // 创建词库副本
    char** wordSet = (char**)malloc(sizeof(char*) * wordCount);
    for (int i = 0; i < wordCount; ++i)
        wordSet[i] = words[i];

    // 创建结果字符串
    char* ans = (char*)malloc(sizeof(char) * MAX_SENTENCE_LEN);
    ans[0] = '\0';
  
    while (sentenceCount > 0) {
        char* sentence = sentences[--sentenceCount];
        int len = strlen(sentence);
      
        while (len > 0) {
            // 提取句子的前缀作为可能的词
            char* fragment = (char*)malloc(sizeof(char) * (MAX_WORD_LEN + 1));
            strncpy(fragment, sentence, len);
            fragment[len] = '\0';
          
            // 如果该词在词库中，则加入结果中
            if (isInWordSet(fragment, wordSet, wordCount)) {
                strcat(ans, fragment);
                strcat(ans, ",");
              
                // 从词库中移除该词
                for (int i = 0; i < wordCount; ++i) {
                    if (strcmp(wordSet[i], fragment) == 0) {
                        free(wordSet[i]);
                        wordSet[i] = wordSet[--wordCount];
                        break;
                    }
                }
              
                // 将剩余部分重新加入待处理句子中
                if (len < strlen(sentence)) {
                    char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - len));
                    strcpy(remaining, sentence + len);
                    sentences[sentenceCount++] = remaining;
                }
              
                free(fragment);
                break;
            }
          
            free(fragment);
            --len;
        }
      
        // 如果句子无法继续切分，则处理为单个字符
        if (len == 0) {
            char* singleChar = (char*)malloc(sizeof(char) * 2);
            strncpy(singleChar, sentence, 1);
            singleChar[1] = '\0';
          
            strcat(ans, singleChar);
            strcat(ans, ",");
          
            // 将剩余部分重新加入待处理句子中
            if (strlen(sentence) > 1) {
                char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - 1));
                strcpy(remaining, sentence + 1);
                sentences[sentenceCount++] = remaining;
            }
          
            free(singleChar);
        }
    }

    free(wordSet);
    return ans;
}

int main() {
    char inputSentence[MAX_SENTENCE_LEN];
    char inputWords[MAX_SENTENCE_LEN];

    // 输入句子和词库
    printf("请输入句子: ");
    fgets(inputSentence, MAX_SENTENCE_LEN, stdin);
    inputSentence[strcspn(inputSentence, "\n")] = 0; // 移除换行符

    printf("请输入词库: ");
    fgets(inputWords, MAX_SENTENCE_LEN, stdin);
    inputWords[strcspn(inputWords, "\n")] = 0; // 移除换行符

    int sentenceCount, wordCount;
    char** sentences = splitString(inputSentence, ",.;", &sentenceCount);
    char** words = splitString(inputWords, ",.;", &wordCount);

    // 获取结果并打印
    char* result = getResult(sentences, sentenceCount, words, wordCount);
    printf("%s\n", result);

    // 释放内存
    for (int i = 0; i < sentenceCount; ++i)
        free(sentences[i]);
    free(sentences);

    for (int i = 0; i < wordCount; ++i)
        free(words[i]);
    free(words);

    free(result);

    return 0;
}

Java算法源码


import java.util.*;

public class Main {
    private static final int MAX_LENGTH = 256;
    private static final int MAX_WORDS_LENGTH = 100000;
    private static final String delimiter = ",.;";
    private static final int TRUE = 1;
    private static final int FALSE = 0;

    /** 实现双向链表 **/
    static class ListNode {
        String val;
        ListNode prev;
        ListNode next;

        ListNode(String val) {
            this.val = val;
            this.prev = null;
            this.next = null;
        }
    }

    static class LinkedList {
        int size;
        ListNode head;
        ListNode tail;

        LinkedList() {
            this.size = 0;
            this.head = null;
            this.tail = null;
        }
    }

    // 创建新的链表节点
    static ListNode newListNode(String val) {
        ListNode node = new ListNode(val);
        node.val = val;
        node.prev = null;
        node.next = null;
        return node;
    }

    // 创建新的链表
    static LinkedList newLinkedList() {
        LinkedList link = new LinkedList();
        link.size = 0;
        link.head = null;
        link.tail = null;
        return link;
    }

    // 在链表头部添加节点
    static void addFirstLinkedList(LinkedList link, String val) {
        ListNode node = newListNode(val);

        if (link.size == 0) {
            link.head = node;
            link.tail = node;
        } else {
            node.next = link.head;
            link.head.prev = node;
            link.head = node;
        }

        link.size++;
    }

    // 在链表尾部添加节点
    static void addLastLinkedList(LinkedList link, String val) {
        ListNode node = newListNode(val);

        if (link.size == 0) {
            link.head = node;
            link.tail = node;
        } else {
            link.tail.next = node;
            node.prev = link.tail;
            link.tail = node;
        }

        link.size++;
    }

    // 移除链表头部节点
    static String removeFirstLinkedList(LinkedList link) {
        if (link.size == 0) System.exit(-1);

        ListNode removed = link.head;

        if (link.size == 1) {
            link.head = null;
            link.tail = null;
        } else {
            link.head = link.head.next;
            link.head.prev = null;
        }

        link.size--;

        String res = removed.val;

        return res;
    }

    // 移除链表尾部节点
    static String removeLastLinkedList(LinkedList link) {
        if (link.size == 0) System.exit(-1);

        ListNode removed = link.tail;

        if (link.size == 1) {
            link.head = null;
            link.tail = null;
        } else {
            link.tail = link.tail.prev;
            link.tail.next = null;
        }

        link.size--;

        String res = removed.val;

        return res;
    }

    /** 实现字符串截取 **/
    static String substr(String s, int l, int r) {
        int len = r - l;
        return s.substring(l, r);
    }

    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);

        String s1 = scanner.nextLine();

        // sentences记录待分词语句
        LinkedList sentences = newLinkedList();
        StringTokenizer st1 = new StringTokenizer(s1, delimiter);
        while (st1.hasMoreTokens()) {
            addLastLinkedList(sentences, st1.nextToken());
        }

        String s2 = scanner.nextLine();

        // words 记录词库词汇
        LinkedList words = newLinkedList();
        StringTokenizer st2 = new StringTokenizer(s2, delimiter);
        while (st2.hasMoreTokens()) {
            addLastLinkedList(words, st2.nextToken());
        }

        // ans记录最终分词结果
        LinkedList ans = newLinkedList();

        while (sentences.size > 0) {
            String sentence = removeFirstLinkedList(sentences);

            int len = sentence.length();
            int r = len;

            for (; r > 0; r--) {
                // 截取句子 [0,r) 范围子串词汇, 这样的就能实现优先最长匹配，并且由于是必须从0索引开始截取，因此满足了分词顺序优先
                String fragment = substr(sentence, 0, r);

                boolean find = false;

                ListNode cur = words.head;
                while (cur != null) {
                    // 若词库中是否存在该子串词汇
                    if (cur.val.equals(fragment)) {
                        // 则将对应子串词汇纳入结果
                        addLastLinkedList(ans, fragment);
                        cur.val = ""; // 我理解词库中每个词汇只能使用一次，因此这里将词库中使用过的词汇移除

                        // 若子串词汇只是句子部分，则句子剩余部分还要继续去词库中查找
                        if (r < len) {
                            addFirstLinkedList(sentences, substr(sentence, r, len));
                        }

                        find = true;

                        break;
                    }
                    cur = cur.next;
                }

                if (find) {
                    break;
                }
            }

            // 没有在词库中找到对应子串词汇，则输出单个字母
            if (r == 0) {
                // 注意，这里只放一个字母到结果中，句子剩余部分继续去词库中查找
                String tmp = sentence.substring(0, 1);
                addLastLinkedList(ans, tmp);

                if (len > 1) {
                    addFirstLinkedList(sentences, substr(sentence, 1, len));
                }
            }
        }

        ListNode cur = ans.head;

        while (cur != null) {
            System.out.print(cur.val);
            cur = cur.next;
            if (cur != null) {
                System.out.print(",");
            }
        }
    }
}

最后修改：2024 年 04 月 01 日

如果觉得我的文章对你有用，请随意赞赏

3 条评论

华为OD2024机试目录 - 留学生Computer Science编程作业/assignment代写
April 15th, 2024 at 11:36 am

[...]2024 C卷 100分序号题目知识点难易程度1密码输入检测数据结构/栈☆☆☆2分配土地几何问题☆☆☆3找座位逻辑分析☆☆☆4智能成绩表动态条件分析☆☆☆5内存冷热标记多条件排序☆☆☆6螺旋数字矩阵逻辑分析☆☆☆7围棋的气逻辑分析☆☆☆8分割平衡字符串逻辑分析☆☆☆9机器人搬砖二分法☆☆☆10转盘寿司数据结构/栈/单调栈☆☆☆11小明找位置二分法☆☆☆12提取字符串的最长合法简单数学表达式双指[...]

回复
华为od2024机试目录 - 留学生Computer Science编程作业/assignment代写
April 15th, 2024 at 10:51 am

[...]2024 C卷 100分序号题目知识点难易程度1密码输入检测数据结构/栈☆☆☆2分配土地几何问题☆☆☆3找座位逻辑分析☆☆☆4智能成绩表动态条件分析☆☆☆5内存冷热标记多条件排序☆☆☆6螺旋数字矩阵逻辑分析☆☆☆7围棋的气逻辑分析☆☆☆8分割平衡字符串逻辑分析☆☆☆9机器人搬砖二分法☆☆☆10转盘寿司数据结构/栈/单调栈☆☆☆11小明找位置二分法☆☆☆12提取字符串的最长合法简单数学表达式双指[...]

回复
华为od 2024年机试 C卷 100分和200分 - 留学生Computer Science编程作业/assignment代写
April 1st, 2024 at 02:03 pm

[...]2024 C卷 100分序号题目知识点难易程度1密码输入检测数据结构/栈☆☆☆2分配土地几何问题☆☆☆3找座位逻辑分析☆☆☆4智能成绩表动态条件分析☆☆☆5内存冷热标记多条件排序☆☆☆6螺旋数字矩阵逻辑分析☆☆☆7围棋的气逻辑分析☆☆☆8分割平衡字符串逻辑分析☆☆☆9机器人搬砖二分法☆☆☆10转盘寿司数据结构/栈/单调栈☆☆☆11小明找位置二分法☆☆☆12提取字符串的最长合法简单数学表达式双指[...]

回复

发表评论取消回复
使用cookie技术保留您的个人信息以便您下次快速评论，继续评论表示您已同意该条款

评论 *

私密评论

名称 *

🎲

邮箱 *

地址

ukrittwaqd
哈哈哈，写的太好了https://www.cscnn.com/
pjyctuyqiz
文章的确不错啊https://www.cscnn.com/
omhrtqdrgu
兄弟写的非常好 https://www.cscnn.com/
fnfgrgnhov
哈哈哈，写的太好了https://www.cscnn.com/
pgmzsywcvi
哈哈哈，写的太好了https://www.cscnn.com/

中文分词模拟器

od • 2024 年 04 月 01 日

<h4><span class="external-link"><a class="no-external-link" href="http://csproject.icu/index.php/archives/112/" target="_blank"><i data-feather="external-link"></i>返回目录</a></span></h4><h4>题目描述</h4><p>给定一个连续不包含空格的字符串，该字符串仅包含英文小写字母及英文标点符号（逗号、分号、句号），同时给定词库，对该字符串进行精确分词。</p><p>说明：</p><ol><li>精确分词：字符串分词后，不会出现重叠。即"ilovechina"，不同词库可分割为"i,love,china"，"ilove,china"，不能分割出现重叠的"i,ilove,china"，i 出现重叠</li><li>标点符号不成词，仅用于断句</li><li>词库：根据外部知识库统计出来的常用词汇例：dictionary = ["i", "love", "china", "lovechina", "ilove"]</li><li>分词原则：采用分词顺序优先且最长匹配原则</li></ol><p>"ilovechina"，假设分词结果 [i,ilove,lo,love,ch,china,lovechina]，则输出 [ilove,china]</p><p>错误输出：[i,lovechina]，原因："ilove" &gt; 优先于 "lovechina" 成词</p><p>错误输出：[i,love,china]，原因："ilove" &gt; "i"遵循最长匹配原则</p><h4>输入描述</h4><p>第一行输入待分词语句 "ilovechina"</p><ul><li>字符串长度限制：0 &lt; length &lt; 256</li></ul><p>第二行输入中文词库 "i,love,china,ch,na,ve,lo,this,is,this,word"</p><ul><li>词库长度限制：1 &lt; length &lt; 100000</li></ul><h4>输出描述</h4><p>按顺序输出分词结果 "i,love,china"</p><p>示例：</p><table><thead><tr><th>输入</th><th align="left">ilovechina<br />i,love,china,ch,na,ve,lo,this,is,the,word</th></tr></thead><tbody><tr><td>输出</td><td align="left">i,love,china</td></tr><tr><td>说明</td><td align="left">无</td></tr></tbody></table><table><thead><tr><th>输入</th><th align="left">lat<br />i,love,china,ch,na,ve,lo,this,is,the,word,beauti,tiful,ful</th></tr></thead><tbody><tr><td>输出</td><td align="left">i.a.t</td></tr><tr><td>说明</td><td align="left">单个字母，<br />不在词库中且不成词则输出单个字母</td></tr></tbody></table><h4>Python算法源码</h4><pre><code class="lang-python">
class ListNode:
    def __init__(self, val):
        self.val = val  # 节点的值
        self.prev = None  # 指向前一个节点的引用
        self.next = None  # 指向下一个节点的引用

class LinkedList:
    def __init__(self):
        self.size = 0  # 链表的大小
        self.head = None  # 指向第一个节点的引用
        self.tail = None  # 指向最后一个节点的引用

def add_first(self, val):
        node = ListNode(val)
        if self.size == 0:
            self.head = node
            self.tail = node
        else:
            node.next = self.head
            self.head.prev = node
            self.head = node
        self.size += 1

def add_last(self, val):
        node = ListNode(val)
        if self.size == 0:
            self.head = node
            self.tail = node
        else:
            self.tail.next = node
            node.prev = self.tail
            self.tail = node
        self.size += 1

def remove_first(self):
        if self.size == 0:
            exit(-1)  # 如果链表为空则退出并显示错误
        removed = self.head
        if self.size == 1:
            self.head = None
            self.tail = None
        else:
            self.head = self.head.next
            self.head.prev = None
        self.size -= 1
        return removed.val

def remove_last(self):
        if self.size == 0:
            exit(-1)  # 如果链表为空则退出并显示错误
        removed = self.tail
        if self.size == 1:
            self.head = None
            self.tail = None
        else:
            self.tail = self.tail.prev
            self.tail.next = None
        self.size -= 1
        return removed.val

def substr(s, l, r):
    return s[l:r]  # 函数用于获取字符串s从索引l到r的子串

MAX_LENGTH = 256  # 字符串的最大长度
MAX_WORDS_LENGTH = 100000  # 字典中单词的最大数量
delimiter = &quot;,.;&quot;  # 用于拆分输入字符串的分隔符

def main():
    s1 = input().strip()  # 待分词的输入字符串

# sentences 记录待分词的句子
    sentences = LinkedList()
    tokens1 = s1.split(delimiter)
    for token in tokens1:
        sentences.add_last(token)

s2 = input().strip()  # 包含词典中单词的输入字符串

# words 记录词典中的单词
    words = LinkedList()
    tokens2 = s2.split(delimiter)
    for token in tokens2:
        words.add_last(token)

# ans 记录最终分词结果
    ans = LinkedList()

while sentences.size &gt; 0:
        sentence = sentences.remove_first()
        length = len(sentence)
        r = length

while r &gt; 0:
            # 从句子中提取子串[0,r)
            fragment = substr(sentence, 0, r)

find = False

cur = words.head
            while cur is not None:
                # 如果子串在词典中存在
                if cur.val == fragment:
                    # 将子串添加到结果中
                    ans.add_last(fragment)
                    cur.val = &quot;&quot;  # 从词典中移除已使用的单词

# 如果子串只是句子的一部分，则继续在词典中搜索剩余部分
                    if r &lt; length:
                        sentences.add_first(substr(sentence, r, length))

find = True
                    break

cur = cur.next

if find:
                break

r -= 1

# 如果子串在词典中未找到，则输出单个字母
        if r == 0:
            # 将单个字母添加到结果中，并继续在词典中搜索剩余部分
            tmp = sentence[0]
            ans.add_last(tmp)
            if length &gt; 1:
                sentences.add_first(substr(sentence, 1, length))

cur = ans.head
    while cur is not None:
        print(cur.val, end=&quot;&quot;)
        cur = cur.next
        if cur is not None:
            print(&quot;,&quot;, end=&quot;&quot;)

if __name__ == &quot;__main__&quot;:
    main()</code></pre><h4>C算法源码</h4><pre><code class="lang-cpp">#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;string.h&gt;

#define MAX_SENTENCE_LEN 1000
#define MAX_WORD_LEN 100

// 分割字符串
char** splitString(char* str, char* delimiters, int* count) {
    char** result = (char**)malloc(sizeof(char*) * MAX_SENTENCE_LEN);
    char* token = strtok(str, delimiters);
    *count = 0;
  
    while (token != NULL) {
        result[(*count)++] = token;
        token = strtok(NULL, delimiters);
    }
  
    return result;
}

// 检查字符串是否在词库中
int isInWordSet(char* word, char** wordSet, int wordSetSize) {
    for (int i = 0; i &lt; wordSetSize; ++i) {
        if (strcmp(word, wordSet[i]) == 0)
            return 1;
    }
    return 0;
}

// 获取结果
char* getResult(char** sentences, int sentenceCount, char** words, int wordCount) {
    // 创建词库副本
    char** wordSet = (char**)malloc(sizeof(char*) * wordCount);
    for (int i = 0; i &lt; wordCount; ++i)
        wordSet[i] = words[i];

// 创建结果字符串
    char* ans = (char*)malloc(sizeof(char) * MAX_SENTENCE_LEN);
    ans[0] = '\0';
  
    while (sentenceCount &gt; 0) {
        char* sentence = sentences[--sentenceCount];
        int len = strlen(sentence);
      
        while (len &gt; 0) {
            // 提取句子的前缀作为可能的词
            char* fragment = (char*)malloc(sizeof(char) * (MAX_WORD_LEN + 1));
            strncpy(fragment, sentence, len);
            fragment[len] = '\0';
          
            // 如果该词在词库中，则加入结果中
            if (isInWordSet(fragment, wordSet, wordCount)) {
                strcat(ans, fragment);
                strcat(ans, &quot;,&quot;);
              
                // 从词库中移除该词
                for (int i = 0; i &lt; wordCount; ++i) {
                    if (strcmp(wordSet[i], fragment) == 0) {
                        free(wordSet[i]);
                        wordSet[i] = wordSet[--wordCount];
                        break;
                    }
                }
              
                // 将剩余部分重新加入待处理句子中
                if (len &lt; strlen(sentence)) {
                    char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - len));
                    strcpy(remaining, sentence + len);
                    sentences[sentenceCount++] = remaining;
                }
              
                free(fragment);
                break;
            }
          
            free(fragment);
            --len;
        }
      
        // 如果句子无法继续切分，则处理为单个字符
        if (len == 0) {
            char* singleChar = (char*)malloc(sizeof(char) * 2);
            strncpy(singleChar, sentence, 1);
            singleChar[1] = '\0';
          
            strcat(ans, singleChar);
            strcat(ans, &quot;,&quot;);
          
            // 将剩余部分重新加入待处理句子中
            if (strlen(sentence) &gt; 1) {
                char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - 1));
                strcpy(remaining, sentence + 1);
                sentences[sentenceCount++] = remaining;
            }
          
            free(singleChar);
        }
    }

free(wordSet);
    return ans;
}

int main() {
    char inputSentence[MAX_SENTENCE_LEN];
    char inputWords[MAX_SENTENCE_LEN];

// 输入句子和词库
    printf(&quot;请输入句子: &quot;);
    fgets(inputSentence, MAX_SENTENCE_LEN, stdin);
    inputSentence[strcspn(inputSentence, &quot;\n&quot;)] = 0; // 移除换行符

printf(&quot;请输入词库: &quot;);
    fgets(inputWords, MAX_SENTENCE_LEN, stdin);
    inputWords[strcspn(inputWords, &quot;\n&quot;)] = 0; // 移除换行符

int sentenceCount, wordCount;
    char** sentences = splitString(inputSentence, &quot;,.;&quot;, &amp;sentenceCount);
    char** words = splitString(inputWords, &quot;,.;&quot;, &amp;wordCount);

// 获取结果并打印
    char* result = getResult(sentences, sentenceCount, words, wordCount);
    printf(&quot;%s\n&quot;, result);

// 释放内存
    for (int i = 0; i &lt; sentenceCount; ++i)
        free(sentences[i]);
    free(sentences);

for (int i = 0; i &lt; wordCount; ++i)
        free(words[i]);
    free(words);

free(result);

return 0;
}</code></pre><h4>Java算法源码</h4><pre><code class="lang-java">
import java.util.*;

public class Main {
    private static final int MAX_LENGTH = 256;
    private static final int MAX_WORDS_LENGTH = 100000;
    private static final String delimiter = &quot;,.;&quot;;
    private static final int TRUE = 1;
    private static final int FALSE = 0;

/** 实现双向链表 **/
    static class ListNode {
        String val;
        ListNode prev;
        ListNode next;

ListNode(String val) {
            this.val = val;
            this.prev = null;
            this.next = null;
        }
    }

static class LinkedList {
        int size;
        ListNode head;
        ListNode tail;

LinkedList() {
            this.size = 0;
            this.head = null;
            this.tail = null;
        }
    }

// 创建新的链表节点
    static ListNode newListNode(String val) {
        ListNode node = new ListNode(val);
        node.val = val;
        node.prev = null;
        node.next = null;
        return node;
    }

// 创建新的链表
    static LinkedList newLinkedList() {
        LinkedList link = new LinkedList();
        link.size = 0;
        link.head = null;
        link.tail = null;
        return link;
    }

// 在链表头部添加节点
    static void addFirstLinkedList(LinkedList link, String val) {
        ListNode node = newListNode(val);

if (link.size == 0) {
            link.head = node;
            link.tail = node;
        } else {
            node.next = link.head;
            link.head.prev = node;
            link.head = node;
        }

link.size++;
    }

// 在链表尾部添加节点
    static void addLastLinkedList(LinkedList link, String val) {
        ListNode node = newListNode(val);

if (link.size == 0) {
            link.head = node;
            link.tail = node;
        } else {
            link.tail.next = node;
            node.prev = link.tail;
            link.tail = node;
        }

link.size++;
    }

// 移除链表头部节点
    static String removeFirstLinkedList(LinkedList link) {
        if (link.size == 0) System.exit(-1);

ListNode removed = link.head;

if (link.size == 1) {
            link.head = null;
            link.tail = null;
        } else {
            link.head = link.head.next;
            link.head.prev = null;
        }

link.size--;

String res = removed.val;

return res;
    }

// 移除链表尾部节点
    static String removeLastLinkedList(LinkedList link) {
        if (link.size == 0) System.exit(-1);

ListNode removed = link.tail;

if (link.size == 1) {
            link.head = null;
            link.tail = null;
        } else {
            link.tail = link.tail.prev;
            link.tail.next = null;
        }

link.size--;

String res = removed.val;

return res;
    }

/** 实现字符串截取 **/
    static String substr(String s, int l, int r) {
        int len = r - l;
        return s.substring(l, r);
    }

public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);

String s1 = scanner.nextLine();

// sentences记录待分词语句
        LinkedList sentences = newLinkedList();
        StringTokenizer st1 = new StringTokenizer(s1, delimiter);
        while (st1.hasMoreTokens()) {
            addLastLinkedList(sentences, st1.nextToken());
        }

String s2 = scanner.nextLine();

// words 记录词库词汇
        LinkedList words = newLinkedList();
        StringTokenizer st2 = new StringTokenizer(s2, delimiter);
        while (st2.hasMoreTokens()) {
            addLastLinkedList(words, st2.nextToken());
        }

// ans记录最终分词结果
        LinkedList ans = newLinkedList();

while (sentences.size &gt; 0) {
            String sentence = removeFirstLinkedList(sentences);

int len = sentence.length();
            int r = len;

for (; r &gt; 0; r--) {
                // 截取句子 [0,r) 范围子串词汇, 这样的就能实现优先最长匹配，并且由于是必须从0索引开始截取，因此满足了分词顺序优先
                String fragment = substr(sentence, 0, r);

boolean find = false;

ListNode cur = words.head;
                while (cur != null) {
                    // 若词库中是否存在该子串词汇
                    if (cur.val.equals(fragment)) {
                        // 则将对应子串词汇纳入结果
                        addLastLinkedList(ans, fragment);
                        cur.val = &quot;&quot;; // 我理解词库中每个词汇只能使用一次，因此这里将词库中使用过的词汇移除

// 若子串词汇只是句子部分，则句子剩余部分还要继续去词库中查找
                        if (r &lt; len) {
                            addFirstLinkedList(sentences, substr(sentence, r, len));
                        }

find = true;

break;
                    }
                    cur = cur.next;
                }

if (find) {
                    break;
                }
            }

// 没有在词库中找到对应子串词汇，则输出单个字母
            if (r == 0) {
                // 注意，这里只放一个字母到结果中，句子剩余部分继续去词库中查找
                String tmp = sentence.substring(0, 1);
                addLastLinkedList(ans, tmp);

if (len &gt; 1) {
                    addFirstLinkedList(sentences, substr(sentence, 1, len));
                }
            }
        }

ListNode cur = ans.head;

while (cur != null) {
            System.out.print(cur.val);
            cur = cur.next;
            if (cur != null) {
                System.out.print(&quot;,&quot;);
            }
        }
    }
}</code></pre>

中文分词模拟器

返回目录

题目描述

输入描述

输出描述

Python算法源码

C算法源码

Java算法源码

3 条评论

发表评论取消回复
使用cookie技术保留您的个人信息以便您下次快速评论，继续评论表示您已同意该条款

华为od C卷 100分200分

华为OD技术面真题

内存冷热标记

计算三叉搜索树的高度

剩余银饰的重量

学生重新排队

最长子字符串的长度（一）

伐木工

可以组成网络的服务器

数组子集

中文分词模拟器

题目描述

输入描述

输出描述

Python算法源码

C算法源码

Java算法源码

3 条评论

发表评论 取消回复 使用cookie技术保留您的个人信息以便您下次快速评论，继续评论表示您已同意该条款

中文分词模拟器

发表评论取消回复
使用cookie技术保留您的个人信息以便您下次快速评论，继续评论表示您已同意该条款