A very simple approach would be to compute a centroid for each cluster (e.g. by averaging the topic distributions of the documents that belong to it) and then calculate the cosine distance between each document in the cluster and that centroid. The document with the smallest distance is the closest to the centroid, and hence the most "representative" of the cluster.
Continuing from the previous example:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# Initialize some documents: each one maps topic name -> weight.
doc1 = {'Science':0.8, 'History':0.05, 'Politics':0.15, 'Sports':0.1}
doc2 = {'News':0.2, 'Art':0.8, 'Politics':0.1, 'Sports':0.1}
doc3 = {'Science':0.8, 'History':0.1, 'Politics':0.05, 'News':0.1}
doc4 = {'Science':0.1, 'Weather':0.2, 'Art':0.7, 'Sports':0.1}
collection = [doc1, doc2, doc3, doc4]

# One row per document, one column per topic. Topics a document does not
# mention come out as NaN, so replace them with a weight of zero.
df = pd.DataFrame(collection).fillna(0)

# Get the feature vectors as a plain 2-D array (documents x topics).
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
feature_matrix = df.to_numpy()

# Fit DBSCAN on the precomputed pairwise cosine-distance matrix.
# min_samples=1 means every document can seed its own cluster.
cosine_distances = pairwise_distances(feature_matrix, metric='cosine')
db = DBSCAN(min_samples=1, metric='precomputed').fit(cosine_distances)
labels = db.labels_

# DBSCAN marks noise points with the label -1, which is not a real cluster.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
# Find the representatives: for every cluster, the document whose feature
# vector has the smallest cosine distance to the cluster's mean vector.
representatives = {}
for label in set(labels):
    # -1 is DBSCAN's noise label, not a cluster (the cluster count above
    # excludes it too), so it gets no representative.
    if label == -1:
        continue
    # Find indices of documents belonging to the same cluster
    ind = np.flatnonzero(labels == label)
    # Select these specific documents
    cluster_samples = feature_matrix[ind, :]
    # Calculate their centroid as an average
    centroid = np.average(cluster_samples, axis=0)
    # Find the distance of each document from the centroid
    distances = [cosine(sample_doc, centroid) for sample_doc in cluster_samples]
    # Keep the document closest to the centroid as the representative.
    # argmin finds the minimum directly; a full sort is unnecessary.
    representatives[label] = cluster_samples[np.argmin(distances), :]

# dict.items() works on Python 3 (iteritems() was Python 2 only).
for label, doc in representatives.items():
    print("Label : %d -- Representative : %s" % (label, str(doc)))