// @BAKE g++ $@
//
// Provenance: recovered from a flattened git patch
//   commit  c1f249e103edb68c5ab31578610174612a6db815
//   author  anon, Sun, 10 Mar 2024 15:57:35 +0100
//   subject "Added levenshtein_dist_-_usage.cpp"
// The patch text had lost its line breaks and every angle-bracketed token
// (header names, template arguments). The headers below are the ones the
// visible code requires; TODO confirm against the original commit.

#include <algorithm>
#include <string>
#include <vector>

using namespace std;

// https://github.com/Meteorix/pylcs

/// Splits a UTF-8 byte string into pieces, cutting before every byte that is
/// not a continuation byte (bit pattern 10xxxxxx).
///
/// Each returned piece is therefore one lead byte followed by the
/// continuation bytes that come directly after it. No validation is done:
/// continuation bytes at the very start of the input stay attached to the
/// first piece.
///
/// @param str  bytes to split
/// @return     the pieces in input order; empty when `str` is empty
vector<string> utf8_split(const string &str){
    vector<string> split;
    const size_t len = str.length();
    size_t left = 0;

    // `right` is the index one past the last byte of the current piece.
    for (size_t right = 1; right <= len; ++right){
        // Close the piece at end of input or when the next byte starts a new
        // sequence (its top two bits are not 10).
        if (right == len || (str[right] & 0xc0) != 0x80){
            split.emplace_back(str, left, right - left);
            left = right;
        }
    }
    return split;
}

// https://github.com/schiffma/distlib

/// Returns the smallest of three ints.
int mini(int a, int b, int c){
    return min(a, min(b, c));
}

// ---------------------------------------------------------------------------
// levenshtein_dist -- TRUNCATED IN THE RECOVERED TEXT
//
// The source breaks off inside the first loop of this function, so the body
// is not reconstructed here. The surviving fragment is kept verbatim below
// (element types of `vector` were lost together with the other <...> tokens).
//
// int levenshtein_dist(const string &word1, const string &word2){
//     ///
//     ///  Please use lower-case strings
//     ///  word1 : first word
//     ///  word2 : second word
//     ///
//
//     //int size1 = word1.size(), size2 = word2.size();
//
//     vector word1_ = utf8_split(word1);
//     vector word2_ = utf8_split(word2);
//     int size1 = word1_.size();
//     int size2 = word2_.size();
//
//     int suppr_dist, insert_dist, subs_dist;
//     int* dist = new int[(size1+1)*(size2+1)];
//
//     for(int i=0; i
// ---------------------------------------------------------------------------