返回目录

题目描述

给定一个连续不包含空格的字符串,该字符串仅包含英文小写字母及英文标点符号(逗号、分号、句号),同时给定词库,对该字符串进行精确分词。

说明:

  1. 精确分词:字符串分词后,不会出现重叠。即"ilovechina",不同词库可分割为"i,love,china","ilove,china",不能分割出现重叠的"i,ilove,china",i 出现重叠
  2. 标点符号不成词,仅用于断句
  3. 词库:根据外部知识库统计出来的常用词汇例:dictionary = ["i", "love", "china", "lovechina", "ilove"]
  4. 分词原则:采用分词顺序优先且最长匹配原则

"ilovechina",假设分词结果 [i,ilove,lo,love,ch,china,lovechina],则输出 [ilove,china]

错误输出:[i,lovechina],原因:"ilove" > 优先于 "lovechina" 成词

错误输出:[i,love,china],原因:"ilove" > "i"遵循最长匹配原则

输入描述

第一行输入待分词语句 "ilovechina"

  • 字符串长度限制:0 < length < 256

第二行输入中文词库 "i,love,china,ch,na,ve,lo,this,is,this,word"

  • 词库长度限制:1 < length < 100000

输出描述

按顺序输出分词结果 "i,love,china"

示例:

输入ilovechina
i,love,china,ch,na,ve,lo,this,is,the,word
输出i,love,china
说明
输入lat
i,love,china,ch,na,ve,lo,this,is,the,word,beauti,tiful,ful
输出i.a.t
说明单个字母,
不在词库中且不成词则输出单个字母

Python算法源码


class ListNode:
    def __init__(self, val):
        self.val = val  # 节点的值
        self.prev = None  # 指向前一个节点的引用
        self.next = None  # 指向下一个节点的引用

class LinkedList:
    def __init__(self):
        self.size = 0  # 链表的大小
        self.head = None  # 指向第一个节点的引用
        self.tail = None  # 指向最后一个节点的引用

    def add_first(self, val):
        node = ListNode(val)
        if self.size == 0:
            self.head = node
            self.tail = node
        else:
            node.next = self.head
            self.head.prev = node
            self.head = node
        self.size += 1

    def add_last(self, val):
        node = ListNode(val)
        if self.size == 0:
            self.head = node
            self.tail = node
        else:
            self.tail.next = node
            node.prev = self.tail
            self.tail = node
        self.size += 1

    def remove_first(self):
        if self.size == 0:
            exit(-1)  # 如果链表为空则退出并显示错误
        removed = self.head
        if self.size == 1:
            self.head = None
            self.tail = None
        else:
            self.head = self.head.next
            self.head.prev = None
        self.size -= 1
        return removed.val

    def remove_last(self):
        if self.size == 0:
            exit(-1)  # 如果链表为空则退出并显示错误
        removed = self.tail
        if self.size == 1:
            self.head = None
            self.tail = None
        else:
            self.tail = self.tail.prev
            self.tail.next = None
        self.size -= 1
        return removed.val

def substr(s, l, r):
    return s[l:r]  # 函数用于获取字符串s从索引l到r的子串

MAX_LENGTH = 256  # 字符串的最大长度
MAX_WORDS_LENGTH = 100000  # 字典中单词的最大数量
delimiter = ",.;"  # 用于拆分输入字符串的分隔符

def main():
    s1 = input().strip()  # 待分词的输入字符串

    # sentences 记录待分词的句子
    sentences = LinkedList()
    tokens1 = s1.split(delimiter)
    for token in tokens1:
        sentences.add_last(token)

    s2 = input().strip()  # 包含词典中单词的输入字符串

    # words 记录词典中的单词
    words = LinkedList()
    tokens2 = s2.split(delimiter)
    for token in tokens2:
        words.add_last(token)

    # ans 记录最终分词结果
    ans = LinkedList()

    while sentences.size > 0:
        sentence = sentences.remove_first()
        length = len(sentence)
        r = length

        while r > 0:
            # 从句子中提取子串[0,r)
            fragment = substr(sentence, 0, r)

            find = False

            cur = words.head
            while cur is not None:
                # 如果子串在词典中存在
                if cur.val == fragment:
                    # 将子串添加到结果中
                    ans.add_last(fragment)
                    cur.val = ""  # 从词典中移除已使用的单词

                    # 如果子串只是句子的一部分,则继续在词典中搜索剩余部分
                    if r < length:
                        sentences.add_first(substr(sentence, r, length))

                    find = True
                    break

                cur = cur.next

            if find:
                break

            r -= 1

        # 如果子串在词典中未找到,则输出单个字母
        if r == 0:
            # 将单个字母添加到结果中,并继续在词典中搜索剩余部分
            tmp = sentence[0]
            ans.add_last(tmp)
            if length > 1:
                sentences.add_first(substr(sentence, 1, length))

    cur = ans.head
    while cur is not None:
        print(cur.val, end="")
        cur = cur.next
        if cur is not None:
            print(",", end="")

if __name__ == "__main__":
    main()

C算法源码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_SENTENCE_LEN 1000
#define MAX_WORD_LEN 100

// 分割字符串
char** splitString(char* str, char* delimiters, int* count) {
    char** result = (char**)malloc(sizeof(char*) * MAX_SENTENCE_LEN);
    char* token = strtok(str, delimiters);
    *count = 0;
  
    while (token != NULL) {
        result[(*count)++] = token;
        token = strtok(NULL, delimiters);
    }
  
    return result;
}

// 检查字符串是否在词库中
int isInWordSet(char* word, char** wordSet, int wordSetSize) {
    for (int i = 0; i < wordSetSize; ++i) {
        if (strcmp(word, wordSet[i]) == 0)
            return 1;
    }
    return 0;
}

// 获取结果
char* getResult(char** sentences, int sentenceCount, char** words, int wordCount) {
    // 创建词库副本
    char** wordSet = (char**)malloc(sizeof(char*) * wordCount);
    for (int i = 0; i < wordCount; ++i)
        wordSet[i] = words[i];

    // 创建结果字符串
    char* ans = (char*)malloc(sizeof(char) * MAX_SENTENCE_LEN);
    ans[0] = '\0';
  
    while (sentenceCount > 0) {
        char* sentence = sentences[--sentenceCount];
        int len = strlen(sentence);
      
        while (len > 0) {
            // 提取句子的前缀作为可能的词
            char* fragment = (char*)malloc(sizeof(char) * (MAX_WORD_LEN + 1));
            strncpy(fragment, sentence, len);
            fragment[len] = '\0';
          
            // 如果该词在词库中,则加入结果中
            if (isInWordSet(fragment, wordSet, wordCount)) {
                strcat(ans, fragment);
                strcat(ans, ",");
              
                // 从词库中移除该词
                for (int i = 0; i < wordCount; ++i) {
                    if (strcmp(wordSet[i], fragment) == 0) {
                        free(wordSet[i]);
                        wordSet[i] = wordSet[--wordCount];
                        break;
                    }
                }
              
                // 将剩余部分重新加入待处理句子中
                if (len < strlen(sentence)) {
                    char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - len));
                    strcpy(remaining, sentence + len);
                    sentences[sentenceCount++] = remaining;
                }
              
                free(fragment);
                break;
            }
          
            free(fragment);
            --len;
        }
      
        // 如果句子无法继续切分,则处理为单个字符
        if (len == 0) {
            char* singleChar = (char*)malloc(sizeof(char) * 2);
            strncpy(singleChar, sentence, 1);
            singleChar[1] = '\0';
          
            strcat(ans, singleChar);
            strcat(ans, ",");
          
            // 将剩余部分重新加入待处理句子中
            if (strlen(sentence) > 1) {
                char* remaining = (char*)malloc(sizeof(char) * (MAX_SENTENCE_LEN - 1));
                strcpy(remaining, sentence + 1);
                sentences[sentenceCount++] = remaining;
            }
          
            free(singleChar);
        }
    }

    free(wordSet);
    return ans;
}

int main() {
    char inputSentence[MAX_SENTENCE_LEN];
    char inputWords[MAX_SENTENCE_LEN];

    // 输入句子和词库
    printf("请输入句子: ");
    fgets(inputSentence, MAX_SENTENCE_LEN, stdin);
    inputSentence[strcspn(inputSentence, "\n")] = 0; // 移除换行符

    printf("请输入词库: ");
    fgets(inputWords, MAX_SENTENCE_LEN, stdin);
    inputWords[strcspn(inputWords, "\n")] = 0; // 移除换行符

    int sentenceCount, wordCount;
    char** sentences = splitString(inputSentence, ",.;", &sentenceCount);
    char** words = splitString(inputWords, ",.;", &wordCount);

    // 获取结果并打印
    char* result = getResult(sentences, sentenceCount, words, wordCount);
    printf("%s\n", result);

    // 释放内存
    for (int i = 0; i < sentenceCount; ++i)
        free(sentences[i]);
    free(sentences);

    for (int i = 0; i < wordCount; ++i)
        free(words[i]);
    free(words);

    free(result);

    return 0;
}

Java算法源码


import java.util.*;

public class Main {
    private static final int MAX_LENGTH = 256;
    private static final int MAX_WORDS_LENGTH = 100000;
    private static final String delimiter = ",.;";
    private static final int TRUE = 1;
    private static final int FALSE = 0;

    /** 实现双向链表 **/
    static class ListNode {
        String val;
        ListNode prev;
        ListNode next;

        ListNode(String val) {
            this.val = val;
            this.prev = null;
            this.next = null;
        }
    }

    static class LinkedList {
        int size;
        ListNode head;
        ListNode tail;

        LinkedList() {
            this.size = 0;
            this.head = null;
            this.tail = null;
        }
    }

    // 创建新的链表节点
    static ListNode newListNode(String val) {
        ListNode node = new ListNode(val);
        node.val = val;
        node.prev = null;
        node.next = null;
        return node;
    }

    // 创建新的链表
    static LinkedList newLinkedList() {
        LinkedList link = new LinkedList();
        link.size = 0;
        link.head = null;
        link.tail = null;
        return link;
    }

    // 在链表头部添加节点
    static void addFirstLinkedList(LinkedList link, String val) {
        ListNode node = newListNode(val);

        if (link.size == 0) {
            link.head = node;
            link.tail = node;
        } else {
            node.next = link.head;
            link.head.prev = node;
            link.head = node;
        }

        link.size++;
    }

    // 在链表尾部添加节点
    static void addLastLinkedList(LinkedList link, String val) {
        ListNode node = newListNode(val);

        if (link.size == 0) {
            link.head = node;
            link.tail = node;
        } else {
            link.tail.next = node;
            node.prev = link.tail;
            link.tail = node;
        }

        link.size++;
    }

    // 移除链表头部节点
    static String removeFirstLinkedList(LinkedList link) {
        if (link.size == 0) System.exit(-1);

        ListNode removed = link.head;

        if (link.size == 1) {
            link.head = null;
            link.tail = null;
        } else {
            link.head = link.head.next;
            link.head.prev = null;
        }

        link.size--;

        String res = removed.val;

        return res;
    }

    // 移除链表尾部节点
    static String removeLastLinkedList(LinkedList link) {
        if (link.size == 0) System.exit(-1);

        ListNode removed = link.tail;

        if (link.size == 1) {
            link.head = null;
            link.tail = null;
        } else {
            link.tail = link.tail.prev;
            link.tail.next = null;
        }

        link.size--;

        String res = removed.val;

        return res;
    }

    /** 实现字符串截取 **/
    static String substr(String s, int l, int r) {
        int len = r - l;
        return s.substring(l, r);
    }

    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);

        String s1 = scanner.nextLine();

        // sentences记录待分词语句
        LinkedList sentences = newLinkedList();
        StringTokenizer st1 = new StringTokenizer(s1, delimiter);
        while (st1.hasMoreTokens()) {
            addLastLinkedList(sentences, st1.nextToken());
        }

        String s2 = scanner.nextLine();

        // words 记录词库词汇
        LinkedList words = newLinkedList();
        StringTokenizer st2 = new StringTokenizer(s2, delimiter);
        while (st2.hasMoreTokens()) {
            addLastLinkedList(words, st2.nextToken());
        }

        // ans记录最终分词结果
        LinkedList ans = newLinkedList();

        while (sentences.size > 0) {
            String sentence = removeFirstLinkedList(sentences);

            int len = sentence.length();
            int r = len;

            for (; r > 0; r--) {
                // 截取句子 [0,r) 范围子串词汇, 这样的就能实现优先最长匹配,并且由于是必须从0索引开始截取,因此满足了分词顺序优先
                String fragment = substr(sentence, 0, r);

                boolean find = false;

                ListNode cur = words.head;
                while (cur != null) {
                    // 若词库中是否存在该子串词汇
                    if (cur.val.equals(fragment)) {
                        // 则将对应子串词汇纳入结果
                        addLastLinkedList(ans, fragment);
                        cur.val = ""; // 我理解词库中每个词汇只能使用一次,因此这里将词库中使用过的词汇移除

                        // 若子串词汇只是句子部分,则句子剩余部分还要继续去词库中查找
                        if (r < len) {
                            addFirstLinkedList(sentences, substr(sentence, r, len));
                        }

                        find = true;

                        break;
                    }
                    cur = cur.next;
                }

                if (find) {
                    break;
                }
            }

            // 没有在词库中找到对应子串词汇,则输出单个字母
            if (r == 0) {
                // 注意,这里只放一个字母到结果中,句子剩余部分继续去词库中查找
                String tmp = sentence.substring(0, 1);
                addLastLinkedList(ans, tmp);

                if (len > 1) {
                    addFirstLinkedList(sentences, substr(sentence, 1, len));
                }
            }
        }

        ListNode cur = ans.head;

        while (cur != null) {
            System.out.print(cur.val);
            cur = cur.next;
            if (cur != null) {
                System.out.print(",");
            }
        }
    }
}
最后修改:2024 年 04 月 01 日
如果觉得我的文章对你有用,请随意赞赏