From 68216b07c8ddc2f94fbbccd1a19bf2e3c3700820 Mon Sep 17 00:00:00 2001 From: "Anders S. Christensen" Date: Thu, 25 Oct 2018 13:06:56 +0200 Subject: [PATCH 1/3] Added Kernel PCA and test for KPCA --- qml/kernels/fkpca.f90 | 126 +++++++++++++++++++++++++++++++++++++++++ qml/kernels/kernels.py | 31 ++++++++++ setup.py | 7 ++- test/test_kernels.py | 71 ++++++++++++++++++++++- 4 files changed, 232 insertions(+), 3 deletions(-) create mode 100644 qml/kernels/fkpca.f90 diff --git a/qml/kernels/fkpca.f90 b/qml/kernels/fkpca.f90 new file mode 100644 index 000000000..d0bce5b0c --- /dev/null +++ b/qml/kernels/fkpca.f90 @@ -0,0 +1,126 @@ +! MIT License +! +! Copyright (c) 2018 Anders Steen Christensen +! +! Permission is hereby granted, free of charge, to any person obtaining a copy +! of this software and associated documentation files (the "Software"), to deal +! in the Software without restriction, including without limitation the rights +! to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +! copies of the Software, and to permit persons to whom the Software is +! furnished to do so, subject to the following conditions: +! +! The above copyright notice and this permission notice shall be included in all +! copies or substantial portions of the Software. +! +! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +! OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +! SOFTWARE. + + +subroutine fkpca(k, n, centering, kpca) + + implicit none + + double precision, dimension(:,:), intent(in) :: k + integer, intent(in) :: n + logical, intent(in) :: centering + double precision, dimension(n,n), intent(out) :: kpca + + ! Eigenvalues + double precision, dimension(n) :: eigenvals + + double precision, allocatable, dimension(:) :: work + + integer :: lwork + integer :: info + + integer :: i + + double precision :: inv_n + double precision :: inv_n2 + double precision, allocatable, dimension(:) :: temp + + kpca(:,:) = k(:,:) + + ! This first part centers the matrix, + ! basically Kpca = K - G@K - K@G + G@K@G, with G = 1/n + ! It is a bit hard to follow, sry, but it is very fast + ! and requires very little memory overhead. + + if (centering) then + + inv_n = 1.0d0 / n + inv_n2 = 1.0d0 / (n*n) + + allocate(temp(n)) + temp(:) = 0.0d0 + + !$OMP PARALLEL DO + do i = 1, n + temp(i) = sum(k(i,:)) + enddo + !$OMP END PARALLEL DO + + !$OMP PARALLEL DO + do i = 1, n + kpca(i,:) = kpca(i,:) + sum(temp(:)) * inv_n2 + enddo + !$OMP END PARALLEL DO + + deallocate(temp) + + !$OMP PARALLEL DO + do i = 1, n + kpca(:,i) = kpca(:,i) - sum(k(i,:)) * inv_n + enddo + !$OMP END PARALLEL DO + + !$OMP PARALLEL DO + do i = 1, n + kpca(i,:) = kpca(i,:) - sum(k(:,i)) * inv_n + enddo + !$OMP END PARALLEL DO + + endif + + ! This 2nd part solves the eigenvalue problem with the least + ! memory intensive solver, namely DSYEV(). DSYEVD() is twice + ! as fast, but requires a lot more memory, which quickly + ! becomes prohibitive. + + ! Dry run which returns the optimal "lwork" + allocate(work(1)) + call dsyev("V", "U", n, kpca, n, eigenvals, work, -1, info) + lwork = nint(work(1)) + 1 + deallocate(work) + + ! Get eigenvectors + allocate(work(lwork)) + call dsyev("V", "U", n, kpca, n, eigenvals, work, lwork, info) + deallocate(work) + + if (info < 0) then + + write (*,*) "ERROR: The ", -info, "-th argument to DSYEV() had an illegal value." + + else if (info > 0) then + + write (*,*) "ERROR: DSYEV() failed to compute an eigenvalue." + + end if + + ! This 3rd part sorts the kernel PCA matrix such that the first PCA is kpca(1) + kpca = kpca(:,n:1:-1) + kpca = transpose(kpca) + + !$OMP PARALLEL DO + do i = 1, n + kpca(i,:) = kpca(i,:) * sqrt(eigenvals(n - i + 1)) + enddo + !$OMP END PARALLEL DO + +end subroutine fkpca diff --git a/qml/kernels/kernels.py b/qml/kernels/kernels.py index 81f53f9e0..5eafc4fe3 100644 --- a/qml/kernels/kernels.py +++ b/qml/kernels/kernels.py @@ -36,6 +36,8 @@ from .fkernels import fget_local_kernels_laplacian from .fkernels import fget_vector_kernels_gaussian, fget_vector_kernels_gaussian_symmetric +from .fkernels import fkpca + def laplacian_kernel(A, B, sigma): """ Calculates the Laplacian kernel matrix K, where :math:`K_{ij}`: @@ -351,3 +353,32 @@ def get_local_kernels_laplacian(A, B, na, nb, sigmas): nsigmas = len(sigmas) return fget_local_kernels_laplacian(A.T, B.T, na, nb, sigmas, nma, nmb, nsigmas) + + +def kpca(K, n=2, centering=True): + """ Calculates `n` first principal components for the kernel :math:`K`. + + The PCA is calculated using an OpenMP parallel Fortran routine. + + A square, symmetric kernel matrix is required. Centering of the kernel matrix + is enabled by default, although this isn't a strict requirement. + + :param K: 2D kernel matrix + :type K: numpy array + :param n: Number of kernel PCAs to return (default=2) + :type n: integer + :param centering: Whether to center the kernel matrix (default=True) + :type centering: bool + + :return: array containing the principal components + :rtype: numpy array + """ + + assert K.shape[0] == K.shape[1], "ERROR: Square matrix required for Kernel PCA." + assert np.allclose(K, K.T, atol=1e-8), "ERROR: Symmetric matrix required for Kernel PCA." + assert n <= K.shape[0], "ERROR: Requested more principal components than matrix size." + + size = K.shape[0] + pca = fkpca(K, size, centering) + + return pca[:n] diff --git a/setup.py b/setup.py index c33bbfde6..df59fecd5 100755 --- a/setup.py +++ b/setup.py @@ -47,11 +47,14 @@ ext_fkernels = Extension(name = '.kernels.fkernels', - sources = ['qml/kernels/fkernels.f90'], + sources = [ + 'qml/kernels/fkernels.f90', + 'qml/kernels/fkpca.f90', + ], extra_f90_compile_args = COMPILER_FLAGS, extra_f77_compile_args = COMPILER_FLAGS, extra_compile_args = COMPILER_FLAGS, - extra_link_args = LINKER_FLAGS, + extra_link_args = LINKER_FLAGS + MATH_LINKER_FLAGS, language = FORTRAN, f2py_options=['--quiet']) diff --git a/test/test_kernels.py b/test/test_kernels.py index 8e4ee058a..20600d514 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -23,7 +23,13 @@ from __future__ import print_function import sys +import os import numpy as np +import scipy + +import sklearn +from sklearn.decomposition import KernelPCA + import qml from qml.kernels import laplacian_kernel from qml.kernels import gaussian_kernel @@ -32,6 +38,27 @@ from qml.kernels import linear_kernel from qml.kernels import matern_kernel from qml.kernels import sargan_kernel +from qml.kernels import kpca + +def get_energies(filename): + """ Returns a dictionary with heats of formation for each xyz-file. + """ + + f = open(filename, "r") + lines = f.readlines() + f.close() + + energies = dict() + + for line in lines: + tokens = line.split() + + xyz_name = tokens[0] + hof = float(tokens[1]) + + energies[xyz_name] = hof + + return energies def test_laplacian_kernel(): @@ -136,7 +163,6 @@ def test_matern_kernel(): for metric in ("l1", "l2"): for order in (0, 1, 2): - print(metric,order) matern(metric, order) def matern(metric, order): @@ -220,9 +246,52 @@ def sargan(ngamma): # Check for symmetry: assert np.allclose(Ksymm, Ksymm.T), "Error in Sargan kernel" + +def array_nan_close(a, b): + # Compares arrays, ignoring nans + + m = np.isfinite(a) & np.isfinite(b) + return np.allclose(a[m], b[m], atol=1e-8, rtol=0.0) + + +def test_kpca(): + + test_dir = os.path.dirname(os.path.realpath(__file__)) + + # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenam + data = get_energies(test_dir + "/data/hof_qm7.txt") + + # Generate a list of qml.data.Compound() objects + mols = [] + + keys = sorted(data.keys()) + + np.random.seed(666) + np.random.shuffle(keys) + + n_mols = 100 + + for xyz_file in keys[:n_mols]: + + mol = qml.data.Compound(xyz=test_dir + "/qm7/" + xyz_file) + mol.properties = data[xyz_file] + mol.generate_bob() + mols.append(mol) + + X = np.array([mol.representation for mol in mols]) + K = laplacian_kernel(X, X, 2e5) + + pcas_qml = kpca(K, n=10) + pcas_sklearn = KernelPCA(10, eigen_solver="dense", kernel='precomputed').fit_transform(K) + + assert array_nan_close(np.abs(pcas_sklearn.T), np.abs(pcas_qml)), "Error in Kernel PCA decomposition." + + if __name__ == "__main__": + test_laplacian_kernel() test_gaussian_kernel() test_linear_kernel() test_matern_kernel() test_sargan_kernel() + test_kpca() From e5e5da28c55b6c4b7181ed985791200771d10f55 Mon Sep 17 00:00:00 2001 From: "Anders S. Christensen" Date: Thu, 25 Oct 2018 14:11:21 +0200 Subject: [PATCH 2/3] Updated kernel pca centering --- qml/kernels/fkpca.f90 | 17 +++++++++-------- test/test_kernels.py | 10 +++++----- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/qml/kernels/fkpca.f90 b/qml/kernels/fkpca.f90 index d0bce5b0c..fad9f7ab3 100644 --- a/qml/kernels/fkpca.f90 +++ b/qml/kernels/fkpca.f90 @@ -41,8 +41,8 @@ subroutine fkpca(k, n, centering, kpca) integer :: i double precision :: inv_n - double precision :: inv_n2 double precision, allocatable, dimension(:) :: temp + double precision :: temp_sum kpca(:,:) = k(:,:) @@ -54,37 +54,38 @@ subroutine fkpca(k, n, centering, kpca) if (centering) then inv_n = 1.0d0 / n - inv_n2 = 1.0d0 / (n*n) allocate(temp(n)) temp(:) = 0.0d0 !$OMP PARALLEL DO do i = 1, n - temp(i) = sum(k(i,:)) + temp(i) = sum(k(i,:)) * inv_n enddo !$OMP END PARALLEL DO + temp_sum = sum(temp(:)) * inv_n + !$OMP PARALLEL DO do i = 1, n - kpca(i,:) = kpca(i,:) + sum(temp(:)) * inv_n2 + kpca(i,:) = kpca(i,:) + temp_sum enddo !$OMP END PARALLEL DO - deallocate(temp) - !$OMP PARALLEL DO do i = 1, n - kpca(:,i) = kpca(:,i) - sum(k(i,:)) * inv_n + kpca(:,i) = kpca(:,i) - temp(i) enddo !$OMP END PARALLEL DO !$OMP PARALLEL DO do i = 1, n - kpca(i,:) = kpca(i,:) - sum(k(:,i)) * inv_n + kpca(i,:) = kpca(i,:) - temp(i) enddo !$OMP END PARALLEL DO + deallocate(temp) + endif ! This 2nd part solves the eigenvalue problem with the least diff --git a/test/test_kernels.py b/test/test_kernels.py index 20600d514..735a5c045 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -289,9 +289,9 @@ def test_kpca(): if __name__ == "__main__": - test_laplacian_kernel() - test_gaussian_kernel() - test_linear_kernel() - test_matern_kernel() - test_sargan_kernel() + # test_laplacian_kernel() + # test_gaussian_kernel() + # test_linear_kernel() + # test_matern_kernel() + # test_sargan_kernel() test_kpca() From 8f01eb15d42292ea1b644b9fc2eceba49b5a959d Mon Sep 17 00:00:00 2001 From: andersx Date: Mon, 21 Jan 2019 17:25:18 +0100 Subject: [PATCH 3/3] Updated test_kernels.py --- test/test_kernels.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_kernels.py b/test/test_kernels.py index 735a5c045..20600d514 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -289,9 +289,9 @@ def test_kpca(): if __name__ == "__main__": - # test_laplacian_kernel() - # test_gaussian_kernel() - # test_linear_kernel() - # test_matern_kernel() - # test_sargan_kernel() + test_laplacian_kernel() + test_gaussian_kernel() + test_linear_kernel() + test_matern_kernel() + test_sargan_kernel() test_kpca()