Bonjour
J'écris une bibliothèque pour mettre en œuvre la méthode DBSCAN, qui permet de regrouper des données en clusters. La méthode fonctionne bien, je l'ai testée avec un sketch d'exemple.
J'ai ensuite ajouté des fonctions pour calculer certaines caractéristiques de mes clusters : centroïde et homogénéité. Et la compilation me donne une erreur inattendue :
Arduino : 1.8.12 (Windows 10), Carte : "ESP32 Dev Module, Disabled, Default 4MB with spiffs (1.2MB APP/1.5MB SPIFFS), 240MHz (WiFi/BT), QIO, 80MHz, 4MB (32Mb), 921600, None"
C:\Users***\Documents\Arduino\libraries\DBSCAN\DBSCAN.cpp: In function 'std::vector computeCentroid(uint16_t, const std::vector&)':
C:\Users***\Documents\Arduino\libraries\DBSCAN\DBSCAN.cpp:89:19: error: '_dataset' was not declared in this scope
centroid[k] += _dataset[cluster[j]][k] / cluster.size(); ^
C:\Users***\Documents\Arduino\libraries\DBSCAN\DBSCAN.cpp: In function 'float computeTightness(uint16_t, const std::vector&, const std::vector&)':
C:\Users***\Documents\Arduino\libraries\DBSCAN\DBSCAN.cpp:100:25: error: '_dataset' was not declared in this scope
tightness += distance(_dataset[cluster[j]], centroid) / cluster.size();
^
exit status 1
Erreur de compilation pour la carte ESP32 Dev Module
Je ne comprends pas pourquoi la variable _dataset
(un vecteur 2D) est inconnue dans ces deux fonctions alors qu'elle ne pose aucun problème dans les autres fonctions.
Voici les fichiers :
cpp :
#include "DBSCAN.h"
/* Constructor
Arguments :
epsilon : maximum distance between neighbours
minPts : minimum number of points in a cluster
type : the type of distance
mink : the exponent in case of the Minkovski distance (optionnal)
*/
dbscan::dbscan(float epsilon, int minPts, uint8_t distance, float mink)
{
_epsilon = epsilon;
_minPts = minPts;
_distanceType = distance;
_mink = mink;
}
dbscan::~dbscan() {
}
/* Process the dataset */
void dbscan::init(std::vector<std::vector<float>> const &dataset)
{
_nData = dataset.size();
for (uint16_t i=0; i < _nData; ++i) {
_dataset.push_back(dataset[i]);
_type.push_back(NOT_VISITED);
}
// Average distance
float averageDistance = 0.0f;
for (uint16_t i = 0; i < _nData; ++i)
for (uint16_t j = i + 1; j < _nData; ++j)
averageDistance += distance(_dataset[i], _dataset[j]);
averageDistance /= ((_nData - 1) * _nData / 2);
Serial.printf ("Average distance : %f\n", averageDistance);
// Process dataset
for (uint16_t i = 0; i < _nData; ++i) {
if (_type[i] == NOT_VISITED) {
_type[i] = VISITED;
std::vector<uint16_t> currentCluster;
std::vector<uint16_t> neighbours = findNeighbours(i);
// If the point has too few neighbours : set to noise
if (neighbours.size() < _minPts) {
_type[i] = NOISE;
++_nNoise;
// Serial.println ("Noise!");
} else {
// create a cluster with this point
currentCluster.push_back(i);
enlargeCluster (neighbours, currentCluster);
// Mark all points in the cluster as VISITED
for (uint16_t j = 0; j < currentCluster.size(); ++j)
_type[currentCluster[j]] = VISITED;
// Add current cluster to clusters list
_clusters.push_back(currentCluster);
++_nClusters;
}
}
}
// Print statistics about the clusters
uint16_t nFeatures = _dataset[_clusters[0][0]].size();
Serial.printf ("Created %d clusters.\n", _nClusters);
for (uint16_t i = 0; i < _nClusters; ++i) {
Serial.printf ("Cluster %d : %d points\n",i, _clusters[i].size() - 1);
// Centroid
std::vector<float> centroid;
centroid = computeCentroid(nFeatures, _clusters[i]);
Serial.print("\tCentroid: ");
for (uint16_t k = 0; k < nFeatures; ++k)
Serial.printf("%f ",centroid[k]);
Serial.println();
// Tightness (mean distance to centroid)
Serial.printf ("\tTightness = %.3f\n",computeTightness(nFeatures, _clusters[i], centroid));
}
Serial.printf("%d noise points\n", _nNoise);
}
/* Compute the coordinates of the centroid of a cluster */
std::vector<float> computeCentroid (uint16_t nFeatures, std::vector<uint16_t> const &cluster)
{
std::vector<float> centroid(nFeatures, 0);
for (uint16_t j = 0; j < cluster.size(); ++j) {
for (uint16_t k = 0; k < nFeatures; ++k) {
centroid[k] += _dataset[cluster[j]][k] / cluster.size();
}
}
return centroid;
}
/* Compute the tightness of a cluster */
float computeTightness (uint16_t nFeatures, std::vector<uint16_t> const &cluster, std::vector<float> const ¢roid)
{
float tightness = 0.0f;
for (uint16_t j = 0; j < cluster.size(); ++j)
tightness += distance(_dataset[cluster[j]], centroid) / cluster.size();
return tightness;
}
/* Enlarge an existing cluster */
void dbscan::enlargeCluster (std::vector<uint16_t> neighbours, std::vector<uint16_t> ¤tCluster)
{
uint16_t i = 0;
while (i < neighbours.size()) {
uint16_t index = neighbours[i++];
if (_type[index] == NOT_VISITED) {
std::vector<uint16_t> neighbours2 = findNeighbours(index);
if (neighbours2.size() > _minPts) {
// make union of both neighbourhoods
for (uint16_t j = 0; j < neighbours2.size(); ++j) {
bool isInNeighbours = false;
for (uint16_t k = 0; k < neighbours.size(); ++k) {
if (neighbours2[j] == neighbours[k]) {
isInNeighbours = true;
break;
}
}
if (!isInNeighbours) neighbours.push_back(neighbours2[j]);
}
}
}
// add current point to current cluster is not already part of a cluster
bool isInCluster = false;
for (uint16_t j = 0; j < _nClusters - 1; ++j)
for (uint16_t k = 0; k < _clusters[j].size(); ++k)
if (_clusters[j][k] == index) {
isInCluster = true;
break;
}
if (!isInCluster) currentCluster.push_back(index);
}
}
/* Find the neighbours of a point in the dataset */
std::vector<uint16_t> dbscan::findNeighbours (uint16_t n)
{
std::vector<uint16_t> neighbours;
for (uint16_t i=0; i < _nData; ++i)
if (isNeighbour(_dataset[n], _dataset[i])) neighbours.push_back(i);
return neighbours;
}
/*
Compute the distance between 2 vectors
*/
float dbscan::distance(std::vector<float> const &vector1, std::vector<float> const &vector2)
{
if (vector1.size() != vector2.size()) {
Serial.println("Vector size problem !");
return 1.0e10;
}
float distance = 0.0f;
switch (_distanceType) {
case EUCLIDIAN:
for (uint8_t i=0; i<vector1.size(); ++i) distance += pow(vector1[i] - vector2[i], 2);
distance = sqrt(distance);
break;
case MINKOVSKI:
for (uint8_t i=0; i<vector1.size(); ++i)
distance += pow(abs(vector1[i] - vector2[i]), _mink);
distance = pow(distance, 1./_mink);
break;
case MANHATTAN:
for (uint8_t i=0; i<vector1.size(); ++i) distance += abs(vector1[i] - vector2[i]);
break;
case CHEBYCHEV:
for (uint8_t i=0; i<vector1.size(); ++i) distance += max(distance, abs(vector1[i] - vector2[i]));
break;
case CANBERRA:
for (uint8_t i=0; i<vector1.size(); ++i)
distance += abs(vector1[i] - vector2[i]) / (abs(vector1[i]) + abs(vector2[i]));
break;
default:
Serial.println ("Distance type problem !");
distance = 2.0e10;
}
return distance;
}
int dbscan::countNeighbours(std::vector<float> const &vector)
{
int neighbours = 0.0;
for (uint8_t i=0; i < _nData; ++i)
if (isNeighbour(vector, _dataset[i])) ++neighbours;
return neighbours;
}
bool dbscan::isNeighbour(std::vector<float> const &vector1, std::vector<float> const &vector2)
{
return (distance(vector1, vector2) <= _epsilon);
}
.h :
/*
DBSCAN unsupervised classification algorithm
inspired by https://penseeartificielle.fr/clustering-avec-lalgorithme-dbscan/
https://openclassrooms.com/fr/courses/4379436-explorez-vos-donnees-avec-des-algorithmes-non-supervises/4379571-partitionnez-vos-donnees-avec-dbscan
(c) 2021 Lesept
contact: lesept777@gmail.com
*/
#ifndef dbscan_h
#define dbscan_h
#include <Arduino.h>
enum DISTANCE {
EUCLIDIAN, // euclidian distance
MINKOVSKI, // Minkowski distance (is Euclidian if param = 2)
MANHATTAN, // Manhattan distance
CHEBYCHEV, // Chebychev distance
CANBERRA // Canberra distance
};
enum TYPE {
NOT_VISITED,
VISITED,
NOISE
};
class dbscan
{
private:
float _epsilon = 2.0f;
float _mink = 1.0f;
uint16_t _minPts;
uint16_t _nNoise = 0;
uint8_t _distanceType = 0;
uint16_t _nData = 0;
uint16_t _nClusters = 0;
std::vector<uint8_t> _type;
std::vector<std::vector<float>> _dataset;
std::vector<std::vector<uint16_t>> _clusters;
public:
dbscan (float, int, uint8_t, float = 1.0f);
~dbscan ();
std::vector<float> computeCentroid (uint16_t, std::vector<uint16_t> const &);
float computeTightness (uint16_t, std::vector<uint16_t> const &, std::vector<float> const &);
void init (std::vector<std::vector<float>> const &);
float distance (std::vector<float> const &, std::vector<float> const &);
int countNeighbours (std::vector<float> const &);
bool isNeighbour (std::vector<float> const &, std::vector<float> const &);
std::vector<uint16_t> findNeighbours (uint16_t);
void enlargeCluster (std::vector<uint16_t>, std::vector<uint16_t> &);
};
#endif
Merci de votre aide