# Source code of the function that retrieves array attributes
c_src=r'''
#include <nlcpy.h>

/*
 * Copy selected ve_array metadata of x and y into caller-provided buffers
 * that are written as uint64_t values.
 *
 * x_attributes receives 7 values:
 *   [0] ndim, [1] size, [2..3] strides of the last two axes,
 *   [4..5] shape of the last two axes, [6] itemsize.
 * y_attributes receives 9 values:
 *   [0] ndim, [1] size, [2..4] strides of the last three axes,
 *   [5..7] shape of the last three axes, [8] itemsize.
 *
 * Precondition: x has at least 2 dimensions and y at least 3. The code
 * indexes strides/shape at ndim-2 (x) and ndim-3 (y), so a smaller rank
 * would read before the start of those arrays.
 */
void ve_get_a(ve_array *x, ve_array *y,
              ve_array *x_attributes,
              ve_array *y_attributes) {

    /* The element data of x and y is never read, so no data pointers are
       taken; only the descriptor fields are copied. */

    /* Attributes of x: ndim, size, strides, shape, bytes per element */
    uint64_t *px_attr = (uint64_t *)x_attributes->ve_adr;
    px_attr[0] = x->ndim;
    px_attr[1] = x->size;
    px_attr[2] = x->strides[x->ndim - 2];
    px_attr[3] = x->strides[x->ndim - 1];
    px_attr[4] = x->shape[x->ndim - 2];
    px_attr[5] = x->shape[x->ndim - 1];
    px_attr[6] = x->itemsize;

    /* Attributes of y: ndim, size, strides, shape, bytes per element */
    uint64_t *py_attr = (uint64_t *)y_attributes->ve_adr;
    py_attr[0] = y->ndim;
    py_attr[1] = y->size;
    py_attr[2] = y->strides[y->ndim - 3];
    py_attr[3] = y->strides[y->ndim - 2];
    py_attr[4] = y->strides[y->ndim - 1];
    py_attr[5] = y->shape[y->ndim - 3];
    py_attr[6] = y->shape[y->ndim - 2];
    py_attr[7] = y->shape[y->ndim - 1];
    py_attr[8] = y->itemsize;
}
'''


                                                                import nlcpy 
from nlcpy import veo
from nlcpy import ve_types
import numpy

# Compile the C source held in c_src into a VE library.
ve_lib = nlcpy.jit.CustomVELibrary(code=c_src)

# Bind the C function: four pointer arguments, no return value.
ve_get_a = ve_lib.get_function(
    've_get_a',
    args_type=(ve_types.void_p, ve_types.void_p, ve_types.void_p, ve_types.void_p),
    ret_type=ve_types.void
)


# Sample inputs: a 2-D float32 array and a 3-D float64 array, plus uint64
# output buffers sized for the 7 and 9 values that ve_get_a writes.
x = nlcpy.arange(20, dtype='f4').reshape((4, 5))
y = nlcpy.arange(60, dtype='f8').reshape((3, 4, 5))
x_attr = nlcpy.empty(7, dtype='u8')
y_attr = nlcpy.empty(9, dtype='u8')

# Run ve_get_a
ve_get_a(x, y, x_attr, y_attr)


                                                                "x->ndim: {}, x->size: {}, x->strides: {}, x->shape: {}, x->itemsize: {}".format(x_attr[0], x_attr[1], x_attr[2:4], x_attr[4:6], x_attr[6])

'x->ndim: 2, x->size: 20, x->strides: [20  4], x->shape: [4 5], x->itemsize: 4'


                                                                "y->ndim: {}, y->size: {}, y->strides: {}, y->shape: {}, y->itemsize: {}".format(y_attr[0], y_attr[1], y_attr[2:5], y_attr[5:8], y_attr[8])

'y->ndim: 3, y->size: 60, y->strides: [160  40   8], y->shape: [3 4 5], y->itemsize: 8'


                                                                c_src=r'''
#include <nlcpy.h>

/*
 * Add x and y element by element over their last two axes and store the
 * result in z. All three buffers are accessed as double, and the loop
 * bounds are the last two entries of z->shape.
 */
void ve_add(ve_array *x, ve_array *y, ve_array *z) {
    /* Element buffers of the two operands and of the destination */
    double *lhs = (double *)x->ve_adr;
    double *rhs = (double *)y->ve_adr;
    double *out = (double *)z->ve_adr;

    /* Steps in elements (stride divided by item size) along the last
       axis ("col") and the second-to-last axis ("row") of each array */
    const uint64_t x_col = x->strides[x->ndim-1] / x->itemsize;
    const uint64_t x_row = x->strides[x->ndim-2] / x->itemsize;
    const uint64_t y_col = y->strides[y->ndim-1] / y->itemsize;
    const uint64_t y_row = y->strides[y->ndim-2] / y->itemsize;
    const uint64_t z_col = z->strides[z->ndim-1] / z->itemsize;
    const uint64_t z_row = z->strides[z->ndim-2] / z->itemsize;

    /* Extent of the iteration space, taken from the destination */
    const uint64_t n_rows = z->shape[z->ndim-2];
    const uint64_t n_cols = z->shape[z->ndim-1];

    /* Sum the operands into the destination, rows shared across threads */
    #pragma omp parallel for
    for (int row = 0; row < n_rows; row++) {
        for (int col = 0; col < n_cols; col++) {
            out[row*z_row + col*z_col] =
                lhs[row*x_row + col*x_col] + rhs[row*y_row + col*y_col];
        }
    }
}
'''


                                                                ve_lib = nlcpy.jit.CustomVELibrary(code=c_src)

ve_add = ve_lib.get_function(
    've_add',
    args_type=(ve_types.void_p, ve_types.void_p, ve_types.void_p),
    ret_type=ve_types.void
)


                                                                x = nlcpy.arange(20, dtype='f8').reshape((5, 4))
y = nlcpy.arange(20, dtype='f8').reshape((4, 5))
z = nlcpy.arange(20, dtype='f8').reshape((4, 5))

ve_add(x.T, y, z)

x.T

array([[ 0.,  4.,  8., 12., 16.],
       [ 1.,  5.,  9., 13., 17.],
       [ 2.,  6., 10., 14., 18.],
       [ 3.,  7., 11., 15., 19.]])


                                                                x.T.strides

(8, 32)

y

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.]])


                                                                y.strides

(40, 8)

z

array([[ 0.,  5., 10., 15., 20.],
       [ 6., 11., 16., 21., 26.],
       [12., 17., 22., 27., 32.],
       [18., 23., 28., 33., 38.]])


                                                                src = r'''
#include <stdint.h>
/* Store the sum of the first two floats of a into the first float of b. */
void onstack_test(float *a, float *b) {
    const float first = a[0];
    const float second = a[1];
    *b = first + second;
}
'''

# Compile the onstack_test source and bind it: two pointer arguments,
# no return value.
ve_lib = nlcpy.jit.CustomVELibrary(code=src)
on_stack = ve_lib.get_function(
    'onstack_test',
    args_type=(ve_types.void_p, ve_types.void_p),
    ret_type=ve_types.void
)

# Host-side NumPy buffers: a is the input, b receives the result.
a = numpy.array([1, 2], dtype='f4')
b = numpy.empty(1, dtype='f4')
print("a: {}, b: {}".format(a, b))

# Transfer buffer data between VH and VE using OnStack:
# a is passed with INTENT_IN and b with INTENT_OUT.
on_stack(
    veo.OnStack(a, inout=veo.INTENT_IN),
    veo.OnStack(b, inout=veo.INTENT_OUT),
    sync=True
)
print("a: {}, b: {}".format(a, b))

a: [1. 2.], b: [0.]
a: [1. 2.], b: [3.]


                                                                import string
# Status codes by name; the value of each name is its position in the tuple.
err = {
    name: code
    for code, name in enumerate(
        ('ERR_OK', 'ERR_MEMORY', 'ERR_NDIM', 'ERR_DTYPE', 'ERR_CONTIGUOUS')
    )
}

temp = string.Template(r'''
#include <nlcpy.h>
#include <stdlib.h>

/*
 * Validate x and return a status code for the first check that fails.
 * The ERR_* placeholders are replaced with numeric codes by the enclosing
 * Python template before this source is compiled.
 * Checks, in order: data address is not NULL, ndim is 1, dtype is ve_f64,
 * and the contiguity flags.
 */
uint64_t callback_test(ve_array *x) {
    double *px = (double *)x->ve_adr;
    if (px == NULL) return ${ERR_MEMORY};
    if (x->ndim != 1) return ${ERR_NDIM};
    if (x->dtype != ve_f64) return ${ERR_DTYPE};
    /* Fails when the bitwise AND of the two contiguity flags is zero. */
    if (! (x->is_c_contiguous & x->is_f_contiguous)) return ${ERR_CONTIGUOUS};
    /* do something here */
    return ${ERR_OK};
}
''')
src = temp.substitute(err)


# Compile the substituted template source and bind callback_test:
# one pointer argument, uint64 return value.
ve_lib = nlcpy.jit.CustomVELibrary(code=src)
callback_test = ve_lib.get_function(
    'callback_test',
    args_type=(ve_types.void_p,),
    ret_type=ve_types.uint64
)
# Define the callback function
def err_print(retval):
    """Print the name of the status code in ``retval``.

    Args:
        retval: Value compared with ``==`` against the values of the
            module-level ``err`` mapping.

    Raises:
        ValueError: If ``retval`` matches none of the values in ``err``.
    """
    # Reverse lookup: the first name whose code equals retval wins.
    for name, code in err.items():
        if retval == code:
            print(name)
            return
    # ValueError is still an Exception, so existing handlers keep working,
    # but the message now says which code was not recognised.
    raise ValueError('unknown return code: {!r}'.format(retval))


                                                                x = nlcpy.arange(9, dtype='f8')
_ = callback_test(x, callback=err_print, sync=True)

ERR_OK


                                                                x = nlcpy.arange(9, dtype='f4')
_ = callback_test(x, callback=err_print, sync=True)

ERR_DTYPE


                                                                x = nlcpy.arange(9, dtype='f8')[::2]
_ = callback_test(x, callback=err_print, sync=True)

ERR_CONTIGUOUS


                                                                import numpy as np
import nlcpy as vp
from nlcpy import ve_types
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import time
import contextlib


                                                                N_SAMPLES    = 2000000  # The number of samples
N_CLUSTERS   =       7  # The number of clusters
MAX_ITER     =     300  # The number of maximum iterations
N_DRAW       =   10000  # The number of samples for drawing
N_KMEANS_PP  =      10  # The number of iterations for k-means++


                                                                src = r'''
#include <stdio.h>
#include <stdlib.h>
#include <nlcpy.h>

/*
 * Recompute each cluster centroid as the mean of the samples assigned to it.
 *
 *   label      : buffer read as int64; label[j] is the cluster of sample j
 *   s          : buffer read as double, indexed [feature][sample]
 *   c          : buffer written as double, indexed [cluster][feature]
 *   n_clusters : number of clusters
 *
 * Returns 0 on success. Returns -1 when a data address is NULL, when the
 * array ranks or shapes do not fit together, or when a scratch buffer
 * cannot be allocated; no scratch memory is left allocated in any case.
 *
 * A cluster without samples is divided by a zero count, as in the
 * array expression quoted in the comments below.
 */
int64_t update_centroid(ve_array *label,
                        ve_array *s,
                        ve_array *c,
                        int64_t n_clusters) {
    int64_t *plabel = (int64_t *)label->ve_adr;
    double *ps = (double *)s->ve_adr;
    double *pc = (double *)c->ve_adr;
    if (plabel == NULL || ps == NULL || pc == NULL) return -1;

    /* shape[] and strides[] are read at [0] for label and at [0], [1]
       for s and c. */
    if (label->ndim < 1 || s->ndim < 2 || c->ndim < 2) return -1;

    const int64_t n_samples = (int64_t)label->shape[0];
    const int64_t n_features = (int64_t)s->shape[0];

    /* The mask has one row of n_samples entries per cluster and is read
       once per column of s; sums and counts hold n_clusters rows. Any
       other combination of shapes would index outside these buffers. */
    if ((int64_t)s->shape[1] != n_samples) return -1;
    if ((int64_t)c->shape[0] > n_clusters) return -1;
    if ((int64_t)c->shape[1] > n_features) return -1;

    int32_t *pmask = (int32_t *)malloc(sizeof(int32_t) * n_clusters * n_samples);
    double *psums = (double *)malloc(sizeof(double) * n_clusters * n_features);
    int32_t *pcounts = (int32_t *)malloc(sizeof(int32_t) * n_clusters);
    if (pmask == NULL || psums == NULL || pcounts == NULL) {
        /* free(NULL) is a no-op, so this releases whichever buffers
           were obtained before the failure. */
        free(pmask);
        free(psums);
        free(pcounts);
        return -1;
    }

    /* Steps in elements: stride divided by item size */
    const int64_t il0 = label->strides[0] / label->itemsize;
    const int64_t is0 = s->strides[0] / s->itemsize;
    const int64_t is1 = s->strides[1] / s->itemsize;
    const int64_t ic0 = c->strides[0] / c->itemsize;
    const int64_t ic1 = c->strides[1] / c->itemsize;

    // mask = (label == vp.arange(N_CLUSTERS)[:, None])
    #pragma omp parallel for
    for (int64_t i = 0; i < n_clusters; i++) {
        for (int64_t j = 0; j < n_samples; j++) {
            pmask[i*n_samples + j] = (plabel[j * il0] == i) ? 1 : 0;
        }
    }

    // sums = vp.where(mask[:, None, :], s.T, 0).sum(axis=2)
    #pragma omp parallel for
    for (int64_t i = 0; i < n_clusters; i++) {
        for (int64_t j = 0; j < n_features; j++) {
            psums[i*n_features + j] = 0;
            for (int64_t k = 0; k < n_samples; k++) {
                if (pmask[i*n_samples + k] == 1) {
                    psums[i*n_features + j] += ps[j*is0 + k*is1];
                }
            }
        }
    }

    // counts = mask.sum(axis=1).reshape((N_CLUSTERS, 1))
    #pragma omp parallel for
    for (int64_t i = 0; i < n_clusters; i++) {
        pcounts[i] = 0;
        for (int64_t j = 0; j < n_samples; j++) {
            pcounts[i] += pmask[i*n_samples + j];
        }
    }

    // c = sums / counts
    for (int64_t i = 0; i < c->shape[0]; i++) {
        #pragma _NEC novector
        for (int64_t j = 0; j < c->shape[1]; j++) {
            pc[i*ic0 + j*ic1] = psums[i*n_features + j] / pcounts[i];
        }
    }

    free(pmask);
    free(psums);
    free(pcounts);

    return 0;
}
'''


                                                                def draw(s, c, l, it):
    # Plot the samples and centroids of the fitted clusters into an image file.
    fig = plt.figure()
    np.random.seed(777)
    colors = np.random.rand(N_CLUSTERS, 3)
    ind = np.random.randint(0, N_SAMPLES, N_DRAW)
    s = s[ind]
    l = l[ind]
    plt.text(0, 0, 'number of iterations: {}'.format(it),
             fontsize='large')
    for i in range(N_CLUSTERS):
        labels = s[l == i, :]
        plt.scatter(labels[:, 0], labels[:, 1], color=colors[i, :])
    plt.scatter(
            c[:, 0], c[:, 1], s=120, marker='s', facecolors=colors,
            edgecolors='k')
    plt.show()


                                                                def kmeans_nlcpy(data, center, jit=True):
    vp.request.flush()
    t0 = time.time()

    if jit:
        ve_lib = vp.jit.CustomVELibrary(
            code=src,
            ftrace=True,
        )
        update_kernel = ve_lib.get_function(
            'update_centroid',
            args_type=(
                ve_types.void_p,
                ve_types.void_p,
                ve_types.void_p,
                ve_types.int64),
            ret_type=ve_types.int64
        )

    data = vp.asarray(data)
    center = vp.asarray(center)

    label = vp.zeros(N_SAMPLES, dtype='i8')

    for i in range(MAX_ITER):
        # Estimate the distance and label for each samples
        d = vp.linalg.norm(data[None, :, :] - center[:, None, :], axis=2)
        label_new = vp.argmin(d, axis=0)
        if vp.all(label == label_new):
            break
        label = label_new

        if jit:
            update_kernel(label, data.T, center, N_CLUSTERS)
        else:
            mask = (label == vp.arange(N_CLUSTERS)[:, None])
            sums = vp.where(mask[:, None, :], data.T, 0).sum(axis=2)
            counts = mask.sum(axis=1).reshape((N_CLUSTERS, 1))
            center = sums / counts

    vp.request.flush()
    t1 = time.time()
    name = 'NLCPy_JIT' if jit else 'NLCPy\t'
    print('{}\t: {} [sec]'.format(name, t1 - t0))

    draw(
        data.get(),
        center.get(),
        label.get(),
        i + 1,
    )


                                                                def kmeans_sklearn(data, init_center):
    k_means = KMeans(
        random_state=111,
        init=init_center,
        n_clusters=N_CLUSTERS,
        max_iter=MAX_ITER,
        n_init=1,
        tol=1e-9
    )
    t0 = time.time()
    k_means.fit(data)
    t1 = time.time()
    print('Scikit-Learn\t: {} [sec]'.format(t1 - t0))

    draw(
        data,
        k_means.cluster_centers_,
        k_means.labels_,
        k_means.n_iter_,
    )


                                                                def kmeans_pp(data, n_clusters, n_samples):
    center = np.empty((n_clusters, 2))
    dist = np.zeros((n_clusters, n_samples))
    center[0] = data[np.random.randint(0, n_samples)]
    dist[0] = np.linalg.norm(data - center[0], axis=1)
    ind_pool = np.arange(n_samples)

    for i in range(1, n_clusters):
        _dist = dist[:i].min(axis=0)
        p = _dist / _dist.sum()
        ind = np.random.choice(ind_pool, 1, p=p)
        center[i] = data[ind]
        dist[i] = np.linalg.norm(data - center[i], axis=1)

    return center, dist.min(axis=0).sum()


                                                                if __name__ == '__main__':
    assert N_SAMPLES >= N_CLUSTERS
    assert N_SAMPLES >= N_DRAW

    data, _ = make_blobs(
        random_state=111,
        n_samples=N_SAMPLES,
        n_features=2,
        cluster_std=.7,
        centers=N_CLUSTERS,
    )
    
    print('k-means++', end='', flush=True)
    np.random.seed(111)
    init_center, min_score = kmeans_pp(data, N_CLUSTERS, N_SAMPLES)
    for _ in range(N_KMEANS_PP-1):
        center, score = kmeans_pp(data, N_CLUSTERS, N_SAMPLES)
        if score < min_score:
            init_center = center
            min_score = score
        print('.', end='', flush=True)
    print('done')
    print('initial-centroids')
    draw(data, init_center, np.zeros(N_SAMPLES, dtype='i8'), 0)
    
    kmeans_sklearn(data, init_center)
    kmeans_nlcpy(data, init_center, jit=False)
    kmeans_nlcpy(data, init_center, jit=True)

k-means++.........done
initial-centroids

Scikit-Learn	: 0.5512814521789551 [sec]

NLCPy		: 0.47018861770629883 [sec]

NLCPy_JIT	: 0.2472062110900879 [sec]

BluStellar（ブルーステラ）

製品・ソリューション

業種・業務

企業情報

サイト内の現在位置

NumPy互換数値演算ライブラリNLCPy　NLCPy ndarray属性の取得、callback、そしてk-meansクラスタリング処理サンプル

1. NLCPy ndarray属性の取得¶

Sample-1¶

Sample-2¶

2. on_stack命令によるNumPy ndarrayデータのベクトルエンジンへの転送¶

Sample-3¶

3. C/C++/Fortran記述関数からcallback¶

Sample-4¶

4. k-meansクラスタリング処理におけるscikit-learn等との実行時間比較¶

関連リンク

サイト内の現在位置

NumPy互換数値演算ライブラリNLCPy NLCPy ndarray属性の取得、callback、そしてk-meansクラスタリング処理サンプル

1. NLCPy ndarray属性の取得¶

Sample-1¶

Sample-2¶

2. on_stack命令によるNumPy ndarrayデータのベクトルエンジンへの転送¶

Sample-3¶

3. C/C++/Fortran記述関数からcallback¶

Sample-4¶

4. k-meansクラスタリング処理におけるscikit-learn等との実行時間比較¶

関連リンク

NumPy互換数値演算ライブラリNLCPy　NLCPy ndarray属性の取得、callback、そしてk-meansクラスタリング処理サンプル