Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on: [push]

jobs:
build:
runs-on: ubuntu-20.04
runs-on: ubuntu-24.04
strategy:
matrix:
python-version: ["3.10"]
Expand Down
31 changes: 0 additions & 31 deletions lkmeans/examples/decomposition.py

This file was deleted.

74 changes: 62 additions & 12 deletions lkmeans/examples/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import time
from collections import defaultdict
from enum import Enum
from typing import Dict, Optional
from typing import Dict, Optional, Tuple

import numpy as np
from numpy.typing import NDArray
from sklearn import datasets
from sklearn.metrics import (accuracy_score, adjusted_mutual_info_score, adjusted_rand_score, completeness_score,
homogeneity_score, normalized_mutual_info_score, v_measure_score)
from tap import Tap
Expand All @@ -25,10 +26,20 @@ class ClusteringAlgorithmType(Enum):
HARD_SEMI_SUPERVISED_LKMEANS = 'hard_semi_supervised_lkmeans'


class DataType(Enum):
    """Data sources an experiment can be run on.

    The member values are the dataset names used on the command line
    (they match the names passed by the real-data runner scripts).
    """

    # Synthetic mixture sampled with ``generate_mix_distribution``.
    GENERATED = 'generated'

    # Datasets loaded through ``sklearn.datasets.load_*``.
    WINE = 'wine'
    BREAST_CANCER = 'breast_cancer'
    IRIS = 'iris'
    DIGITS = 'digits'

    # Datasets fetched through ``sklearn.datasets.fetch_openml``.
    MNIST = 'mnist'
    CIFAR10 = 'cifar10'


class ExperimentArguments(Tap):
minkowski_parameter: float
t_parameter: float
n_points: int
t_parameter: Optional[float] = None
n_points: Optional[int] = None
clustering_algorithm: ClusteringAlgorithmType = ClusteringAlgorithmType.LKMEANS
self_supervised_preprocessor_algorithm: Optional[PreprocessorType] = None
self_supervised_components: int = 2
Expand All @@ -38,6 +49,14 @@ class ExperimentArguments(Tap):
repeats: int = 10
supervision_ratio: float = 0

dataset: DataType = DataType.GENERATED

def validate_args(args: ExperimentArguments) -> None:
    """Check that the options required by the selected dataset were supplied.

    ``t_parameter`` and ``n_points`` are optional on the command line, but the
    synthetic dataset (``DataType.GENERATED``) cannot be sampled without them.

    Args:
        args: Parsed experiment arguments.

    Raises:
        ValueError: If the generated dataset is selected and ``t_parameter``
            or ``n_points`` was not provided.
    """
    if args.dataset is not DataType.GENERATED:
        return

    # Name the missing option in the message: interpolating its value would
    # only ever produce the unhelpful text "Specify None".
    if args.t_parameter is None:
        raise ValueError("Specify t_parameter: it is required for the generated dataset")
    if args.n_points is None:
        raise ValueError("Specify n_points: it is required for the generated dataset")


def get_clustering_algorithm(clustering_type: ClusteringAlgorithmType) -> Clustering:
clustering_map: Dict[ClusteringAlgorithmType, Clustering] = {
Expand All @@ -60,24 +79,55 @@ def calculate_metrics(labels: NDArray, generated_labels: NDArray) -> Dict[str, f
}


def generate_data(args: ExperimentArguments) -> Tuple[NDArray, NDArray]:
    """Produce the samples and reference labels for the selected dataset.

    For ``DataType.GENERATED`` a fresh sample is drawn from the mixture
    described by ``get_experiment_data``; the other datasets are loaded
    through ``sklearn.datasets``.

    Side effect: when the number of distinct labels differs from
    ``args.num_clusters``, a warning is printed and ``args.num_clusters`` is
    overwritten with the number found in the data.

    Args:
        args: Parsed experiment arguments. ``dataset`` selects the source;
            ``num_clusters``, ``dimension``, ``n_points`` and ``t_parameter``
            are used for the generated dataset.

    Returns:
        A ``(data, labels)`` pair.

    Raises:
        ValueError: If ``args.dataset`` is not a supported dataset.
    """
    # Datasets loaded by a dedicated ``sklearn.datasets.load_*`` function.
    bundled_loaders = {
        DataType.WINE: datasets.load_wine,
        DataType.BREAST_CANCER: datasets.load_breast_cancer,
        DataType.IRIS: datasets.load_iris,
        DataType.DIGITS: datasets.load_digits,
    }
    # Datasets fetched from OpenML, keyed by their OpenML name.
    openml_names = {
        DataType.MNIST: 'mnist_784',
        DataType.CIFAR10: 'CIFAR_10_small',
    }

    if args.dataset is DataType.GENERATED:
        _, prob, mu_list, cov_matrices = get_experiment_data(args.num_clusters, args.dimension)

        data, labels, _ = generate_mix_distribution(
            probability=prob,
            mu_list=mu_list,
            cov_matrices=cov_matrices,
            n_samples=args.n_points,
            t=args.t_parameter
        )
    elif args.dataset in bundled_loaders:
        data, labels = bundled_loaders[args.dataset](return_X_y=True)
    elif args.dataset in openml_names:
        # as_frame=False: by default fetch_openml may hand back pandas
        # objects, while this function promises NumPy arrays to its callers.
        data, labels = datasets.fetch_openml(
            openml_names[args.dataset],
            version=1,
            return_X_y=True,
            as_frame=False,
        )
        # The labels are converted to integers, as before.
        labels = labels.astype(int)
    else:
        raise ValueError("Not supported dataset")

    num_clusters_in_dataset = len(set(labels))
    if args.num_clusters != num_clusters_in_dataset:
        print(f"Warning: {args.dataset} has {num_clusters_in_dataset} clusters",
              f"while num_clusters = {args.num_clusters} is passed.",
              f"Changed the num_clusters to {num_clusters_in_dataset}")
        # The requested cluster count is replaced by the one found in the data.
        args.num_clusters = num_clusters_in_dataset
    return data, labels


def main() -> None:
args = ExperimentArguments(underscores_to_dashes=True).parse_args()

_, prob, mu_list, cov_matrices = get_experiment_data(args.num_clusters, args.dimension)
validate_args(args)

clustering = get_clustering_algorithm(args.clustering_algorithm)

average_result = defaultdict(list)

for _ in range(args.repeats):

clusters, labels, _ = generate_mix_distribution(
probability=prob,
mu_list=mu_list,
cov_matrices=cov_matrices,
n_samples=args.n_points,
t=args.t_parameter
)
clusters, labels = generate_data(args)

if args.self_supervised_preprocessor_algorithm is not None:
self_supervised_parameters = PreprocessorParameters(n_components=args.self_supervised_components)
Expand Down
52 changes: 52 additions & 0 deletions lkmeans/examples/scripts/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# How to run the benchmarks


## Unsupervised clustering

### Generated data
1. Run benchmarking
```bash
bash ./lkmeans/examples/scripts/runner_unsupervised.sh
```

1. Submit a SLURM job for benchmarking
```bash
sbatch ./lkmeans/examples/scripts/sbatch_unsupervised.sh
```

## Semi-Supervised clustering

### Generated data
1. Run benchmarking
```bash
bash ./lkmeans/examples/scripts/runner_semi_supervised.sh
```

1. Submit a SLURM job for benchmarking
```bash
sbatch ./lkmeans/examples/scripts/sbatch_semi_supervised.sh
```

## Self-Supervised clustering

### Generated data
1. Run benchmarking
```bash
bash ./lkmeans/examples/scripts/runner_self_supervised.sh
```

1. Submit a SLURM job for benchmarking
```bash
sbatch ./lkmeans/examples/scripts/sbatch_self_supervised.sh
```

### Real data
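1. Run benchmarking. The script takes three positional arguments: the dataset, the number of preprocessor components, and the preprocessor (for example `wine 2 pca`)
```bash
bash ./lkmeans/examples/scripts/runner_self_supervised_real_data.sh wine 2 pca
```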

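1. Submit a SLURM job for benchmarking. The same three positional arguments (dataset, number of preprocessor components, preprocessor) are forwarded to the runner script
```bash
sbatch ./lkmeans/examples/scripts/sbatch_self_supervised_real_data.sh wine 2 pca
```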
42 changes: 42 additions & 0 deletions lkmeans/examples/scripts/runner_self_supervised_real_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# Run the self-supervised clustering benchmark on one real dataset.
#
# Usage:
#   runner_self_supervised_real_data.sh DATASET PREPROCESS_COMPONENT PREPROCESSOR
#
#   DATASET               value passed to main.py as --dataset
#   PREPROCESS_COMPONENT  value passed as --self-supervised-components
#   PREPROCESSOR          value passed as --self-supervised-preprocessor-algorithm
#                         (super_runner_real_data.sh sweeps: pca spectral_embeddings
#                         locally_linear_embeddings mds isomap umap)
#
# The combined stdout/stderr of every run is written to a log file in LOGDIR.

LOGDIR=experiments_data/logs_super/detailed_self_supervised_clustering_real_data_1000
set -ex

mkdir -p "${LOGDIR}"

# VALUES
# Fail early with a usage message instead of handing empty options to main.py.
USAGE="usage: $0 DATASET PREPROCESS_COMPONENT PREPROCESSOR"
DATASET=${1:?${USAGE}}
PREPROCESS_COMPONENT=${2:?${USAGE}}
PREPROCESSOR=${3:?${USAGE}}
MINKOWSKI_VALUES=(2)

# Constants
CLUSTERING=lkmeans
REPEATS=100

for MINKOWSKI in "${MINKOWSKI_VALUES[@]}"; do
    NAME="${CLUSTERING}_|_self_supervision_${PREPROCESSOR}_|_self_supervision_n_components_${PREPROCESS_COMPONENT}_|_dataset_${DATASET}_|_minkowski_${MINKOWSKI}_|_repeats_${REPEATS}.log"

    echo "${LOGDIR}/${NAME}"

    # An array keeps every option/value pair a separate word without relying
    # on word splitting of an unquoted string.
    PARAMETERS=(
        --dataset "${DATASET}"
        --minkowski-parameter "${MINKOWSKI}"
        --clustering-algorithm "${CLUSTERING}"
        --repeats "${REPEATS}"
        --self-supervised-preprocessor-algorithm "${PREPROCESSOR}"
        --self-supervised-components "${PREPROCESS_COMPONENT}"
    )
    python lkmeans/examples/main.py "${PARAMETERS[@]}" &> "${LOGDIR}/${NAME}"
done

echo DONE
31 changes: 31 additions & 0 deletions lkmeans/examples/scripts/runner_unsupervised_real_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# Run the unsupervised clustering benchmark on one real dataset.
#
# Usage:
#   runner_unsupervised_real_data.sh DATASET
#
#   DATASET  value passed to main.py as --dataset
#
# The combined stdout/stderr of every run is written to a log file in LOGDIR.

LOGDIR=experiments_data/logs_super/unsupervised_clustering_real_data_1000
set -ex

mkdir -p "${LOGDIR}"

# VALUES
MINKOWSKI_VALUES=(2)

# Constants
CLUSTERING=lkmeans
REPEATS=100

# Fail early with a usage message instead of handing an empty --dataset to main.py.
DATASET=${1:?usage: $0 DATASET}

for MINKOWSKI in "${MINKOWSKI_VALUES[@]}"; do
    # NUM_CLUSTERS is not set anywhere in this script, so the segment is empty
    # unless the caller's environment provides it; the explicit empty default
    # keeps the log name unchanged.
    NAME="${CLUSTERING}_|_num-clusters_${NUM_CLUSTERS:-}_|_dataset_${DATASET}_|_minkowski_${MINKOWSKI}__|_repeats_${REPEATS}.log"

    echo "${NAME}"

    # An array keeps every option/value pair a separate word without relying
    # on word splitting of an unquoted string.
    PARAMETERS=(
        --dataset "${DATASET}"
        --minkowski-parameter "${MINKOWSKI}"
        --clustering-algorithm "${CLUSTERING}"
        --repeats "${REPEATS}"
    )
    python lkmeans/examples/main.py "${PARAMETERS[@]}" &> "${LOGDIR}/${NAME}"
done

17 changes: 17 additions & 0 deletions lkmeans/examples/scripts/sbatch_self_supervised_real_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#! /bin/bash
#SBATCH --job-name="self_supervised_clustering"
#SBATCH --output=%x_%j.out

#SBATCH --gpus=0
#SBATCH --time=0-2:0

# SLURM batch script for the self-supervised real-data benchmark.
# The three positional arguments are forwarded unchanged to
# runner_self_supervised_real_data.sh, which reads them as
# DATASET, PREPROCESS_COMPONENT and PREPROCESSOR.

# Start from a clean module environment, then load Python.
module purge
module load Python

# Leave any currently active environment before activating lkmeans_venv.
source deactivate
source activate lkmeans_venv

# Executable
# Append the current working directory to PYTHONPATH
# (presumably so the lkmeans package is importable when run from the repository root; confirm).
export PYTHONPATH=${PYTHONPATH}:$(pwd)

# NOTE(review): $1 $2 $3 are unquoted, so arguments containing whitespace would be split.
srun bash ./lkmeans/examples/scripts/runner_self_supervised_real_data.sh $1 $2 $3
17 changes: 17 additions & 0 deletions lkmeans/examples/scripts/sbatch_unsupervised_real_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#! /bin/bash
#SBATCH --job-name="unsupervised_clustering"
#SBATCH --output=%x_%j.out

#SBATCH --gpus=0
#SBATCH --time=10-0:0

# SLURM batch script for the unsupervised real-data benchmark.
# The single positional argument is forwarded unchanged to
# runner_unsupervised_real_data.sh, which reads it as DATASET.

# Start from a clean module environment, then load Python.
module purge
module load Python

# Leave any currently active environment before activating lkmeans_venv.
source deactivate
source activate lkmeans_venv

# Executable
# Append the current working directory to PYTHONPATH
# (presumably so the lkmeans package is importable when run from the repository root; confirm).
export PYTHONPATH=${PYTHONPATH}:$(pwd)

# NOTE(review): $1 is unquoted, so an argument containing whitespace would be split.
srun bash ./lkmeans/examples/scripts/runner_unsupervised_real_data.sh $1
2 changes: 1 addition & 1 deletion lkmeans/examples/scripts/super_runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ PREPROCESS_COMPONENTS=(19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2)

for T in "${T_VALUES[@]}";do
for PREPROCESS_COMPONENT in "${PREPROCESS_COMPONENTS[@]}";do
sbatch -A proj_1538 ./lkmeans/examples/scripts/sbatch_self_supervised.sh ${T} ${PREPROCESS_COMPONENT}
sbatch ./lkmeans/examples/scripts/sbatch_self_supervised.sh ${T} ${PREPROCESS_COMPONENT}
done
done
16 changes: 16 additions & 0 deletions lkmeans/examples/scripts/super_runner_real_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Submit one self-supervised real-data SLURM job for every
# (dataset, number of components, preprocessor) combination.
DATASETS=(wine breast_cancer iris digits mnist cifar10)
PREPROCESS_COMPONENTS=(19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2)
PREPROCESSOR_VALUES=(pca spectral_embeddings locally_linear_embeddings mds isomap umap)

SELF_SUPERVISED_JOB=./lkmeans/examples/scripts/sbatch_self_supervised_real_data.sh

for DATASET in "${DATASETS[@]}"; do
    for PREPROCESS_COMPONENT in "${PREPROCESS_COMPONENTS[@]}"; do
        for PREPROCESSOR in "${PREPROCESSOR_VALUES[@]}"; do
            # Argument order expected by the job script: dataset, components, preprocessor.
            sbatch "${SELF_SUPERVISED_JOB}" "${DATASET}" "${PREPROCESS_COMPONENT}" "${PREPROCESSOR}"
        done
    done
done


# Disabled: unsupervised sweep, one job per dataset.
# for DATASET in "${DATASETS[@]}";do
# sbatch ./lkmeans/examples/scripts/sbatch_unsupervised_real_data.sh ${DATASET}
# done
Loading