% Journal article: Al-Salemi and Ab Aziz, Journal of Computer Science 7(1), pp. 39--45.
% Proper adjectives in the title are brace-protected so sentence-casing styles keep their capitals.
% NOTE(review): the DOI/URL embed "2011" while year/month record Dec 2010 -- confirm which date should be cited.
@article{10.3844/jcssp.2011.39.45,
  author       = {Al-Salemi, Bassam and Ab Aziz, Mohd. Juzaiddin},
  title        = {Statistical {Bayesian} Learning for Automatic {Arabic} Text Categorization},
  journal      = {Journal of Computer Science},
  publisher    = {Science Publications},
  volume       = {7},
  number       = {1},
  pages        = {39--45},
  year         = {2010},
  month        = dec,
  doi          = {10.3844/jcssp.2011.39.45},
  url          = {https://thescipub.com/abstract/jcssp.2011.39.45},
  article_type = {journal},
  abstract     = {Problem statement: The rapid increasing of online Arabic documents necessitated applying Text Categorization techniques that are commonly used for English language to categorize them automatically. The complex morphology of Arabic language and its large vocabulary size make applying these techniques directly difficult and costly in time and effort. Approach: We have investigated Bayesian learning models in order to enhance Arabic ATC. Three classifiers based on Bayesian theorem had been implemented which are Simple Naïve Bayes (NB), Multi-variant Bernoulli Naïve Bayes (MBNB) and Multinomial Naïve Bayes (MNB) models. TREC-2002 Light Stemmer was applied for Arabic stemming. For text representation, Bag-Of-Word and character-level n-gram with the length 3, 4 and 5 are used. In order to reduce the dimensionality of feature space, the following feature selection methods: Mutual Information, Chi-Square statistic, Odds Ratio and GSS-coefficient were used. Conclusion: MBNB classifier outperformed both of NB and MNB classifiers. BOW representation leads to the best classification performance; nevertheless, using character-level n-gram leads to satisfying results for Arabic ATC based on Bayesian learning. Moreover, the use of feature selection methods dramatically increases the categorization performance.},
}