% Journal article: Nhita and Kurniawan (2022), Journal of Computer Science 18(10), pp. 913--927.
@article{10.3844/jcssp.2022.913.927,
  author       = {Nhita, Fhira and Kurniawan, Isman},
  title        = {Classification of Non-Small Cell Lung Cancer Based on Gene Expression in Cases of Smokers and Non-Smokers using Ensemble Methods with Statistical Based Feature Selection},
  journal      = {Journal of Computer Science},
  publisher    = {Science Publications},
  volume       = {18},
  number       = {10},
  pages        = {913--927},
  year         = {2022},
  month        = sep,
  doi          = {10.3844/jcssp.2022.913.927},
  url          = {https://thescipub.com/abstract/jcssp.2022.913.927},
  article_type = {journal},
  abstract     = {Lung cancer is one of the leading causes of death globally. One of the main risk factors for lung cancer is smoking, which causes more than 90\% of lung cancer cases. There are two types of lung cancer, i.e., Small Cell Lung Cancer (SCLC) and Non-Small Cell Lung Cancer (NSCLC), which the latter is the most common. One method that can be used to detect cancer is the implementation of machine learning on gene expression data. Machine learning is one approach that promises good performance in classifying gene expression data. This study aimed to predict the existence of NSCLC based on gene expression, whether including NSCLC or normal. We used three data sets, i.e., GSE10072, GSE19804, and GSE19188, which relate to the cases of NSCLC in smokers and nonsmokers. The prediction was carried out using six Ensemble Methods, i.e., Random Forest, Adaptive Boosting, Extra Tree, Gradient Boosting, Extreme Gradient Boosting, and Categorical Boosting. Feature selection was carried out by calculating the correlation between feature and target according to statistical parameters, i.e., ANOVA, Mutual Information (MI), and a combination of ANOVA and MI. We obtained the prediction model that outperformed the related studies for two similar datasets with the value of accuracy for the GSE10072, GSE19804, and GSE19188 datasets 100\%, 97.22\%, and 100\%, respectively},
}