python – K Group Clusterization programming challenge from justpaste.it


I have implemented k means as below, and as far as I know, it doesn’t guarantee k grouping when given k. But the task I have which is https://justpaste.it/5cmte, requires me to return exactly k clusters. How should I go around that?

Task

You are given n points.
You have to separate them into k groups (clusters).

Input:

The first line contains two numbers: n – the number of points and k – the number of groups.
The next n lines contain two numbers x and y – the coordinates of a point.

Output:

The first line should contain the sum of distances between each point to the center of its cluster.
Each of the next k lines should contain the coordinates of center of the current cluster, the number of points of the cluster and the indices of the points belonging to the cluster (all values should be separated with a whitespace).

Example input:

6 3
1 2
5 5
7 3
2 4
9 9
3 6

Output:

10.8982
1.5 3 2 1 4
3 6 1 6
7 5.66667 3 2 3 5

Sometimes, the same input, gives k-1, k-2 clusters instead of k.

So my point is that. k means does not guarantee k clusters. I am required to return exact k clusters, how should I do that?

K means:

import sys, random, math
from collections import defaultdict

def pointAvg(points):
    dimensions = len(points(0))
    newCenter = ()
    for dimension in range(dimensions):
        dimSum = 0
        for p in points:
            dimSum += p(dimension)
        newCenter.append(dimSum / float(len(points)))
    return newCenter

def updateCenters(dataSet, assignments):
    newMeans = defaultdict(list)
    centers = ()
    for assignment, point in zip(assignments, dataSet):
        newMeans(assignment).append(point)
    for points in newMeans.values():
        centers.append(pointAvg(points))
    return centers, newMeans
    
def assignPoints(dataPoints, centers):
    assignments = ()
    for point in dataPoints:
        shortest = float('inf')
        shortestIndex, distances = 0, 0
        for i in range(len(centers)):
            val = distance(point, centers(i))
            distances += val
            if val < shortest:
                shortest = val
                shortestIndex = i
        assignments.append(shortestIndex)
    return (assignments, distances)

def distance(a, b):
    dimensions = len(a)
    _sum = 0
    for dimension in range(dimensions):
        differenceSq = (a(dimension) - b(dimension)) ** 2
        _sum += differenceSq
    return math.sqrt(_sum)

def generateK(dataSet, k):
    centers = ()
    dimensions = len(dataSet(0))
    minMax = defaultdict(int)
    for point in dataSet:
        for i in range(dimensions):
            val = point(i)
            minKey = 'min_%d' % i
            maxKey = 'max_%d' % i
            if minKey not in minMax or val < minMax(minKey):
                minMax(minKey) = val
            if maxKey not in minMax or val > minMax(maxKey):
                minMax(maxKey) = val
    for _k in range(k):
        randPoint = ()
        for i in range(dimensions):
            minVal = minMax('min_%d' % i)
            maxVal = minMax('max_%d' % i)
            randPoint.append(random.uniform(minVal, maxVal))
        centers.append(randPoint)
    return centers

def kMeans(dataset, k):
    kPoints = generateK(dataset, k)
    assignments = assignPoints(dataset, kPoints)(0)
    oldAssignments = None
    while assignments != oldAssignments:
        updatedCenters = updateCenters(dataset, assignments)
        newCenters = updatedCenters(0)
        means = updatedCenters(1)
        oldAssignments = assignments
        assignedPoints = assignPoints(dataset, newCenters)
        assignments = assignedPoints(0)
        sumDistances = assignedPoints(1) 
    print(sumDistances)
    return (newCenters, means)
    
if __name__ == '__main__':

    # sys.stdin  = open('input.txt', 'r')
    # sys.stdout = open('output.txt', 'w')
    
    i, points = 0, ()
    for line in sys.stdin:
        line = line.rstrip()
        line = line.split()
        line(0), line(1) = int(line(0)), int(line(1))
        if (i == 0):
            n, k = line(0), line(1)
        else:
            points.append(line)
        i += 1
        
    clusters = kMeans(points, k)
    for i in range(k):
        print('%g' % clusters(0)(i)(0), '%g' % clusters(0)(i)(1), len(clusters(1)(i)), end=' ')
        for j in range(len(clusters(1)(i))):
            print(points.index(clusters(1)(i)(j)) + 1, end=' ')
        print()