% Journal article, Journal of Computer Science 9(11), 2013. The citation key is the article's DOI.
% The abstract is kept exactly as entered; article_type is a non-standard field that standard styles ignore.
@article{10.3844/jcssp.2013.1514.1518,
  author       = {Kumar, A. Venkatesh and Vengataasalam, S.},
  title        = {Cluster Based Duplicate Detection},
  journal      = {Journal of Computer Science},
  volume       = {9},
  number       = {11},
  pages        = {1514--1518},
  year         = {2013},
  month        = sep,
  publisher    = {Science Publications},
  doi          = {10.3844/jcssp.2013.1514.1518},
  url          = {https://thescipub.com/abstract/jcssp.2013.1514.1518},
  abstract     = {We propose a clustering technique for entropy based text dis-similarity calculation of de-duplication system. Improve the quality of grouping; in this study we propose a Multi-Level Group Detection (MLGD) algorithm which produces a most accurate group with most closely related object using Alternative Decision Tree (ADT) technique. Our propose a two new algorithm; first one is Multi-Level Group Detection (MLGD) formation using Alternative Decision Tree (AD Tree), which will split the bunch of record into self-sized cluster to reduce the volume of data for text comparisons. Second one is calculating the dis-similarity percentage using entropy and Information Gain (IG). We show experimentally our proposed technique achieves higher average accuracy than existing traditional de-duplication system. Further, our technique not required any manual tuning for clustering formations as well as dis-similarity calculation for any kind of business data. In this study, we have presented a new efficient method is introduced for clustering formation using ADTree algorithm for duplicate deduction. The new method offers more accuracy dis-similarity measure for each cluster data without manual intervention at the time of duplicate deduction.},
  article_type = {journal},
}