diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..1ff0c42 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,63 @@ +############################################################################### +# Set default behavior to automatically normalize line endings. +############################################################################### +* text=auto + +############################################################################### +# Set default behavior for command prompt diff. +# +# This is need for earlier builds of msysgit that does not have it on by +# default for csharp files. +# Note: This is only used by command line +############################################################################### +#*.cs diff=csharp + +############################################################################### +# Set the merge driver for project and solution files +# +# Merging from the command prompt will add diff markers to the files if there +# are conflicts (Merging from VS is not affected by the settings below, in VS +# the diff markers are never inserted). Diff markers may cause the following +# file extensions to fail to load in VS. An alternative would be to treat +# these files as binary and thus will always conflict and require user +# intervention with every merge. To do so, just uncomment the entries below +############################################################################### +#*.sln merge=binary +#*.csproj merge=binary +#*.vbproj merge=binary +#*.vcxproj merge=binary +#*.vcproj merge=binary +#*.dbproj merge=binary +#*.fsproj merge=binary +#*.lsproj merge=binary +#*.wixproj merge=binary +#*.modelproj merge=binary +#*.sqlproj merge=binary +#*.wwaproj merge=binary + +############################################################################### +# behavior for image files +# +# image files are treated as binary by default. 
+############################################################################### +#*.jpg binary +#*.png binary +#*.gif binary + +############################################################################### +# diff behavior for common document formats +# +# Convert binary document formats to text before diffing them. This feature +# is only available from the command line. Turn it on by uncommenting the +# entries below. +############################################################################### +#*.doc diff=astextplain +#*.DOC diff=astextplain +#*.docx diff=astextplain +#*.DOCX diff=astextplain +#*.dot diff=astextplain +#*.DOT diff=astextplain +#*.pdf diff=astextplain +#*.PDF diff=astextplain +#*.rtf diff=astextplain +#*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c4efe2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,261 @@ +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+ +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ + +# Visual Studio 2015 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# DNX +project.lock.json +project.fragment.lock.json +artifacts/ + +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# TODO: Comment the next line if you want to checkin your web deploy settings +# but database connection 
strings (with potential passwords) will be unencrypted +#*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/packages/* +# except build/, which is used as an MSBuild target. +!**/packages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/packages/repositories.config +# NuGet v3's project.json files produces more ignoreable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +node_modules/ +orleans.codegen.cs + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. 
Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# JetBrains Rider +.idea/ +*.sln.iml + +# CodeRush +.cr/ + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc \ No newline at end of file diff --git a/SimpleKNN/SimpleKNN.sln b/SimpleKNN/SimpleKNN.sln new file mode 100644 index 0000000..5b75e23 --- /dev/null +++ b/SimpleKNN/SimpleKNN.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.421 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SimpleKNN", "SimpleKNN\SimpleKNN.vcxproj", "{7E167104-A9C0-49A3-AA16-00443A7AD47B}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {7E167104-A9C0-49A3-AA16-00443A7AD47B}.Debug|x64.ActiveCfg = Debug|x64 + {7E167104-A9C0-49A3-AA16-00443A7AD47B}.Debug|x64.Build.0 = Debug|x64 + {7E167104-A9C0-49A3-AA16-00443A7AD47B}.Debug|x86.ActiveCfg = Debug|Win32 + {7E167104-A9C0-49A3-AA16-00443A7AD47B}.Debug|x86.Build.0 = Debug|Win32 + 
#define _CRT_SECURE_NO_WARNINGS

/*
 * SimpleKNN - a small console k-nearest-neighbours classifier.
 *
 * Sample files are plain text:
 *   line 1: "categorized" or "uncategorized"
 *   line 2: number of samples
 *   line 3: number of dimensions per sample
 *   then one sample per line, comma separated. Categorized samples start
 *   with their group label (group,dim0,dim1,...); uncategorized samples
 *   contain only the coordinates (dim0,dim1,...).
 */

/* --- --- --- --- --- --- --- --- --- --- */

#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* --- --- --- --- --- --- --- --- --- --- */

#define DATATYPE double

/* Console answers are read into INPUT_BUFFER_SIZE bytes; the width in
 * INPUT_WORD_FORMAT must stay one less than that size. */
#define INPUT_BUFFER_SIZE 256
#define INPUT_WORD_FORMAT "%255s"

/* Longest sample-file line (including the terminator) that is read in one piece. */
#define LINE_BUFFER_SIZE 1024

/* Characters that separate or terminate the fields of one sample line. */
#define FIELD_DELIMITERS ",\r\n"

/* Index of each sample set inside struct knn_data. */
enum
{
    TRAINING_SET = 0, /* categorized samples, used as voters   */
    NEW_SET = 1       /* uncategorized samples, to be labelled */
};

/* --- --- --- --- --- --- --- --- --- --- */

/* One point of the data set. */
struct sample
{
    DATATYPE * dim;        /* coordinates, one entry per dimension              */
    uint32_t group;        /* group label read from file or assigned by k-nn    */
    DATATYPE tmp_distance; /* scratch: distance to the sample being classified  */
};

/* Complete state of one classification run. */
struct knn_data
{
    uint32_t k;                     /* requested number of voters                */
    struct sample ** best_voters;   /* k slots pointing into samples[TRAINING_SET] */
    struct sample * samples[2];     /* [TRAINING_SET] and [NEW_SET]              */
    uint32_t samples_count[2];      /* number of samples in each set             */
    uint32_t samples_dimensions[2]; /* number of coordinates per sample, per set */
};

/* --- --- --- --- --- --- --- --- --- --- */

void parse_string_to_sample(struct sample *, char *, uint32_t, uint8_t);
void parse_file_to_samples(struct knn_data *, char *);
void parse_samples_to_file(struct knn_data *, char *);
void knn_algorithms_sort_asc_voters(struct knn_data *);
void knn_algorithm(struct knn_data *);

/* --- --- --- --- --- --- --- --- --- --- */

/* Releases an array of samples together with each sample's coordinate array. */
static void free_samples(struct sample * samples, uint32_t count)
{
    if (samples == NULL)
        return;

    for (uint32_t i = 0; i < count; i++)
        free(samples[i].dim);

    free(samples);
}

/* Prints `prompt` and reads one whitespace-delimited word into `answer`
 * (INPUT_BUFFER_SIZE bytes). Returns 1 on success, 0 on end of input. */
static int ask_word(const char * prompt, char * answer)
{
    printf("%s", prompt);
    fflush(stdout);

    return scanf(INPUT_WORD_FORMAT, answer) == 1;
}

/* Prints every sample of one set as "Sample i - d0 d1 ... | group". */
static void print_samples(const char * title, const struct knn_data * knn, int set)
{
    printf("%s\n", title);

    for (uint32_t i = 0; i < knn->samples_count[set]; i++)
    {
        const struct sample * current = knn->samples[set] + i;

        printf("Sample %" PRIu32 " -", i);

        for (uint32_t j = 0; j < knn->samples_dimensions[set]; j++)
            printf(" %f", (double)current->dim[j]);

        printf(" | %" PRIu32 "\n", current->group);
    }
}

/* --- --- --- --- --- --- --- --- --- --- */

/*
 * Interactive entry point: asks for k and the two sample files, classifies
 * the uncategorized samples, optionally saves them, and prints both sets.
 *
 * Returns EXIT_SUCCESS after a complete run, EXIT_FAILURE when k is invalid,
 * memory cannot be allocated, or the console input ends early.
 */
int main(void)
{
    char tmp_str[INPUT_BUFFER_SIZE];
    int status = EXIT_FAILURE;

    /* Zero-initialised so that unloaded sets are empty and safe to free. */
    struct knn_data knn = {0};

    printf("Set number of voters (k=):");
    fflush(stdout);

    if (scanf("%" SCNu32, &knn.k) != 1 || knn.k == 0)
    {
        fprintf(stderr, "k must be a positive integer.\n");
        return EXIT_FAILURE;
    }

    /* calloc checks the k * sizeof multiplication for overflow. */
    knn.best_voters = calloc(knn.k, sizeof *knn.best_voters);

    if (knn.best_voters == NULL)
    {
        fprintf(stderr, "Out of memory while allocating %" PRIu32 " voters.\n", knn.k);
        return EXIT_FAILURE;
    }

    if (!ask_word("Provide categorized samples file (train data):", tmp_str))
        goto cleanup;

    parse_file_to_samples(&knn, tmp_str);

    if (!ask_word("Provide uncategorized samples file (new data):", tmp_str))
        goto cleanup;

    parse_file_to_samples(&knn, tmp_str);

    printf("Perform k-nn algorithm\n");
    knn_algorithm(&knn);

    if (ask_word("Do you want to save the output (yes/no)?", tmp_str)
        && (strcmp(tmp_str, "yes") == 0 || strcmp(tmp_str, "y") == 0))
    {
        if (ask_word("Where do you want to save the newly categorized data (filepath)?", tmp_str))
        {
            parse_samples_to_file(&knn, tmp_str);
            printf("Completed.\n");
        }
    }

    print_samples("Training Samples:", &knn, TRAINING_SET);
    print_samples("New Categorized Samples:", &knn, NEW_SET);

    status = EXIT_SUCCESS;

cleanup:
    free_samples(knn.samples[TRAINING_SET], knn.samples_count[TRAINING_SET]);
    free_samples(knn.samples[NEW_SET], knn.samples_count[NEW_SET]);
    free(knn.best_voters);

    return status;
}

/* --- --- --- --- --- --- --- --- --- --- */

/*
 * Fills `sample` from one comma separated line.
 *
 * sample         - destination; sample->dim must have room for max_dimensions
 *                  values (a NULL dim is treated as zero dimensions).
 * string         - the line to parse; it is modified in place by strtok.
 * max_dimensions - upper bound on the number of coordinates stored; extra
 *                  fields are ignored, missing ones are set to 0.
 * has_group      - despite the name this is the sample-set index passed by
 *                  parse_file_to_samples: 0 means the line starts with a
 *                  group label, any other value means coordinates only.
 *
 * Not reentrant: uses strtok's internal state.
 */
void parse_string_to_sample(struct sample * sample, char * string, uint32_t max_dimensions,
    uint8_t has_group)
{
    uint32_t stored = 0;
    char * field;

    if (sample == NULL)
        return;

    sample->group = 0;
    sample->tmp_distance = 0;

    if (sample->dim == NULL)
        max_dimensions = 0;

    field = string != NULL ? strtok(string, FIELD_DELIMITERS) : NULL;

    if (field != NULL && has_group == 0)
    {
        sample->group = (uint32_t)strtoul(field, NULL, 10);
        field = strtok(NULL, FIELD_DELIMITERS);
    }

    while (field != NULL && stored < max_dimensions)
    {
        sample->dim[stored++] = (DATATYPE)strtod(field, NULL);
        field = strtok(NULL, FIELD_DELIMITERS);
    }

    /* A short line must not leave coordinates uninitialised. */
    while (stored < max_dimensions)
        sample->dim[stored++] = 0;
}

/* --- --- --- --- --- --- --- --- --- --- */

/*
 * Loads one sample file into `knn`.
 *
 * The first line selects the destination: a header containing
 * "uncategorized" fills samples[NEW_SET], anything else fills
 * samples[TRAINING_SET]. A set that was loaded earlier into the same slot is
 * released first, so `knn` must be zero-initialised before the first call.
 *
 * Errors (unreadable file, missing header lines, out of memory) are reported
 * on stderr and leave `knn` unchanged. When the file holds fewer sample lines
 * than it announces, only the lines actually present are kept.
 */
void parse_file_to_samples(struct knn_data * knn, char * filepath)
{
    char line[LINE_BUFFER_SIZE];
    struct sample * samples = NULL;
    uint32_t count = 0;
    uint32_t dimensions = 0;
    uint32_t loaded = 0;
    int set;
    FILE * file_pointer;

    if (knn == NULL || filepath == NULL)
        return;

    file_pointer = fopen(filepath, "r");

    if (file_pointer == NULL)
    {
        perror(filepath);
        return;
    }

    if (fgets(line, sizeof line, file_pointer) == NULL)
        goto malformed;

    /* "categorized" is a substring of "uncategorized", so test the longer word. */
    set = strstr(line, "uncategorized") == NULL ? TRAINING_SET : NEW_SET;

    if (fgets(line, sizeof line, file_pointer) == NULL)
        goto malformed;

    count = (uint32_t)strtoul(line, NULL, 10);

    if (fgets(line, sizeof line, file_pointer) == NULL)
        goto malformed;

    dimensions = (uint32_t)strtoul(line, NULL, 10);

    if (count > 0)
    {
        samples = calloc(count, sizeof *samples);

        if (samples == NULL)
            goto out_of_memory;
    }

    for (loaded = 0; loaded < count; loaded++)
    {
        if (fgets(line, sizeof line, file_pointer) == NULL)
            break;

        if (dimensions > 0)
        {
            samples[loaded].dim = calloc(dimensions, sizeof *samples[loaded].dim);

            if (samples[loaded].dim == NULL)
                goto out_of_memory;
        }

        parse_string_to_sample(samples + loaded, line, dimensions, (uint8_t)set);
    }

    if (loaded < count)
        fprintf(stderr, "%s: expected %" PRIu32 " samples, found %" PRIu32 "\n",
            filepath, count, loaded);

    free_samples(knn->samples[set], knn->samples_count[set]);

    knn->samples[set] = samples;
    knn->samples_count[set] = loaded;
    knn->samples_dimensions[set] = dimensions;

    fclose(file_pointer);
    return;

out_of_memory:
    fprintf(stderr, "%s: out of memory while loading samples\n", filepath);
    free_samples(samples, loaded);
    fclose(file_pointer);
    return;

malformed:
    fprintf(stderr, "%s: missing header lines\n", filepath);
    fclose(file_pointer);
}

/* --- --- --- --- --- --- --- --- --- --- */

/*
 * Writes samples[NEW_SET], including the group stored in each sample, to
 * `filepath` in the categorized file format, so the result can be loaded
 * again by parse_file_to_samples as training data.
 *
 * Coordinates are written with 17 significant digits. Failures to open or
 * write the file are reported on stderr.
 */
void parse_samples_to_file(struct knn_data * knn, char * filepath)
{
    FILE * file_pointer;

    if (knn == NULL || filepath == NULL)
        return;

    file_pointer = fopen(filepath, "w");

    if (file_pointer == NULL)
    {
        perror(filepath);
        return;
    }

    fprintf(file_pointer, "categorized\n%" PRIu32 "\n%" PRIu32 "\n",
        knn->samples_count[NEW_SET], knn->samples_dimensions[NEW_SET]);

    for (uint32_t i = 0; i < knn->samples_count[NEW_SET]; i++)
    {
        const struct sample * current = knn->samples[NEW_SET] + i;

        fprintf(file_pointer, "%" PRIu32, current->group);

        for (uint32_t j = 0; j < knn->samples_dimensions[NEW_SET]; j++)
            fprintf(file_pointer, ",%.17g", (double)current->dim[j]);

        fputc('\n', file_pointer);
    }

    if (ferror(file_pointer))
        fprintf(stderr, "%s: write error\n", filepath);

    /* fclose flushes buffered output, so its result matters for a written file. */
    if (fclose(file_pointer) != 0)
        perror(filepath);
}

/* --- --- --- --- --- --- --- --- --- --- */

/*
 * Sorts knn->best_voters[0 .. k-1] by ascending tmp_distance (insertion
 * sort; entries with equal distance keep their relative order).
 *
 * Every one of the k slots must point to a sample. knn_algorithm leaves
 * slots beyond the training-set size NULL, and it keeps its voters ordered
 * by itself, so it does not call this function.
 */
void knn_algorithms_sort_asc_voters(struct knn_data * knn)
{
    if (knn == NULL || knn->best_voters == NULL)
        return;

    for (uint32_t i = 1; i < knn->k; i++)
    {
        struct sample * current = knn->best_voters[i];
        uint32_t slot = i;

        while (slot > 0 && knn->best_voters[slot - 1]->tmp_distance > current->tmp_distance)
        {
            knn->best_voters[slot] = knn->best_voters[slot - 1];
            slot--;
        }

        knn->best_voters[slot] = current;
    }
}

/* Euclidean distance between two samples over their first `dimensions` coordinates. */
static DATATYPE euclidean_distance(const struct sample * a, const struct sample * b,
    uint32_t dimensions)
{
    double sum = 0.0;

    for (uint32_t q = 0; q < dimensions; q++)
    {
        double delta = (double)a->dim[q] - (double)b->dim[q];

        sum += delta * delta;
    }

    return (DATATYPE)sqrt(sum);
}

/*
 * Offers `candidate` to the list voters[0 .. kept-1], which is ordered by
 * ascending tmp_distance and holds at most `capacity` entries. The candidate
 * is placed behind entries of equal distance; when the list is full the
 * farthest entry is dropped. Returns the new number of kept entries.
 */
static uint32_t insert_voter(struct sample ** voters, uint32_t kept, uint32_t capacity,
    struct sample * candidate)
{
    uint32_t position = kept;
    uint32_t last;

    while (position > 0 && voters[position - 1]->tmp_distance > candidate->tmp_distance)
        position--;

    if (position >= capacity)
        return kept; /* farther than every voter of a full list */

    last = kept < capacity ? kept : capacity - 1;

    for (uint32_t z = last; z > position; z--)
        voters[z] = voters[z - 1];

    voters[position] = candidate;

    return kept < capacity ? kept + 1 : kept;
}

/*
 * Assigns a group to every sample of samples[NEW_SET] by majority vote among
 * its nearest training samples.
 *
 * The number of voters is min(k, number of training samples). On a tie the
 * group whose first voter is nearest wins. After the call best_voters holds
 * the voters of the last classified sample (unused slots are NULL) and each
 * training sample's tmp_distance holds its distance to that sample.
 *
 * Does nothing when either set is empty, k is 0, memory cannot be allocated,
 * or the two sets have a different number of dimensions (reported on stderr).
 */
void knn_algorithm(struct knn_data * knn)
{
    uint32_t voters;
    uint32_t dimensions;
    uint32_t * groups; /* distinct group labels seen among the voters */
    uint32_t * votes;  /* votes[q] = number of voters in groups[q]     */

    if (knn == NULL || knn->best_voters == NULL)
        return;

    voters = knn->k < knn->samples_count[TRAINING_SET]
        ? knn->k : knn->samples_count[TRAINING_SET];

    if (voters == 0 || knn->samples_count[NEW_SET] == 0)
        return;

    dimensions = knn->samples_dimensions[TRAINING_SET];

    if (dimensions != knn->samples_dimensions[NEW_SET])
    {
        fprintf(stderr, "Training samples have %" PRIu32 " dimensions but new samples have %"
            PRIu32 ".\n", dimensions, knn->samples_dimensions[NEW_SET]);
        return;
    }

    groups = calloc(voters, sizeof *groups);
    votes = calloc(voters, sizeof *votes);

    if (groups == NULL || votes == NULL)
    {
        fprintf(stderr, "Out of memory while counting votes.\n");
        free(groups);
        free(votes);
        return;
    }

    for (uint32_t i = 0; i < knn->samples_count[NEW_SET]; i++)
    {
        struct sample * target = knn->samples[NEW_SET] + i;
        uint32_t kept = 0;
        uint32_t distinct = 0;
        uint32_t winner = 0;

        /* Keep the `voters` nearest training samples, nearest first. */
        for (uint32_t j = 0; j < knn->samples_count[TRAINING_SET]; j++)
        {
            struct sample * candidate = knn->samples[TRAINING_SET] + j;

            candidate->tmp_distance = euclidean_distance(candidate, target, dimensions);
            kept = insert_voter(knn->best_voters, kept, voters, candidate);
        }

        for (uint32_t q = kept; q < knn->k; q++)
            knn->best_voters[q] = NULL;

        /* Tally votes per group. Slots are tracked by `distinct`, not by a
         * sentinel label, so that group 0 is counted like any other group. */
        for (uint32_t j = 0; j < kept; j++)
        {
            uint32_t label = knn->best_voters[j]->group;
            uint32_t q = 0;

            while (q < distinct && groups[q] != label)
                q++;

            if (q == distinct)
            {
                groups[distinct] = label;
                votes[distinct] = 0;
                distinct++;
            }

            votes[q] += 1;
        }

        /* The most common group wins; compare vote counts, not labels. */
        for (uint32_t q = 1; q < distinct; q++)
            if (votes[q] > votes[winner])
                winner = q;

        target->group = groups[winner];
    }

    free(groups);
    free(votes);
}

/* --- --- --- --- --- --- --- --- --- --- */
{7E167104-A9C0-49A3-AA16-00443A7AD47B} + SimpleKNN + 10.0.17763.0 + + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + Level3 + Disabled + true + true + + + + + Level3 + Disabled + true + true + + + + + Level3 + MaxSpeed + true + true + true + true + + + true + true + + + + + Level3 + MaxSpeed + true + true + true + true + + + true + true + + + + + + + + + + + + + \ No newline at end of file diff --git a/SimpleKNN/SimpleKNN/SimpleKNN.vcxproj.filters b/SimpleKNN/SimpleKNN/SimpleKNN.vcxproj.filters new file mode 100644 index 0000000..5b48417 --- /dev/null +++ b/SimpleKNN/SimpleKNN/SimpleKNN.vcxproj.filters @@ -0,0 +1,30 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + + + Resource Files + + + Resource Files + + + \ No newline at end of file diff --git a/SimpleKNN/SimpleKNN/UncategorizedSamples.txt b/SimpleKNN/SimpleKNN/UncategorizedSamples.txt new file mode 100644 index 0000000..df43b47 --- /dev/null +++ b/SimpleKNN/SimpleKNN/UncategorizedSamples.txt @@ -0,0 +1,8 @@ +uncategorized +5 +2 +9,6 +1,2 +3,1 +6,6 +3,3 \ No newline at end of file