Проект логистической регрессии — Стиль жизни

В этом проекте мы будем работать с набором данных UCI для взрослых. Мы попытаемся предсказать, относятся ли люди в наборе данных к определенному классу по зарплате, зарабатывая ‹=50k или ›50k в год.

Получить данные

library(readr) a<-read.csv(‘adult_sal.csv’) head(a) ## X age type_employer fnlwgt education education_num marital ## 1 1 39 State-gov 77516 Bachelors 13 Never-married ## 2 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse ## 3 3 38 Private 215646 HS-grad 9 Divorced ## 4 4 53 Private 234721 11th 7 Married-civ-spouse ## 5 5 28 Private 338409 Bachelors 13 Married-civ-spouse ## 6 6 37 Private 284582 Masters 14 Married-civ-spouse ## occupation relationship race sex capital_gain capital_loss ## 1 Adm-clerical Not-in-family White Male 2174 0 ## 2 Exec-managerial Husband White Male 0 0 ## 3 Handlers-cleaners Not-in-family White Male 0 0 ## 4 Handlers-cleaners Husband Black Male 0 0 ## 5 Prof-specialty Wife Black Female 0 0 ## 6 Exec-managerial Wife White Female 0 0 ## hr_per_week country income ## 1 40 United-States <=50K ## 2 13 United-States <=50K ## 3 40 United-States <=50K ## 4 40 United-States <=50K ## 5 40 Cuba <=50K ## 6 40 United-States <=50K str(a) ## ‘data.frame’: 32561 obs. of 16 variables: ## $ X : int 1 2 3 4 5 6 7 8 9 10 … ## $ age : int 39 50 38 53 28 37 49 52 31 42 … ## $ type_employer: chr «State-gov» «Self-emp-not-inc» «Private» «Private» … ## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 … ## $ education : chr «Bachelors» «Bachelors» «HS-grad» «11th» … ## $ education_num: int 13 13 9 7 13 14 5 9 14 13 … ## $ marital : chr «Never-married» «Married-civ-spouse» «Divorced» «Married-civ-spouse» … ## $ occupation : chr «Adm-clerical» «Exec-managerial» «Handlers-cleaners» «Handlers-cleaners» … ## $ relationship : chr «Not-in-family» «Husband» «Not-in-family» «Husband» … ## $ race : chr «White» «White» «White» «Black» … ## $ sex : chr «Male» «Male» «Male» «Male» … ## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 … ## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 … ## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 … ## $ country : chr «United-States» «United-States» «United-States» «United-States» … ## $ income : chr «<=50K» «<=50K» «<=50K» «<=50K» … summary(a) ## X age type_employer fnlwgt ## Min. : 1 Min. :17.00 Length:32561 Min. : 12285 ## 1st Qu.: 8141 1st Qu.:28.00 Class :character 1st Qu.: 117827 ## Median :16281 Median :37.00 Mode :character Median : 178356 ## Mean :16281 Mean :38.58 Mean : 189778 ## 3rd Qu.:24421 3rd Qu.:48.00 3rd Qu.: 237051 ## Max. :32561 Max. :90.00 Max. :1484705 ## education education_num marital occupation ## Length:32561 Min. : 1.00 Length:32561 Length:32561 ## Class :character 1st Qu.: 9.00 Class :character Class :character ## Mode :character Median :10.00 Mode :character Mode :character ## Mean :10.08 ## 3rd Qu.:12.00 ## Max. :16.00 ## relationship race sex capital_gain ## Length:32561 Length:32561 Length:32561 Min. : 0 ## Class :character Class :character Class :character 1st Qu.: 0 ## Mode :character Mode :character Mode :character Median : 0 ## Mean : 1078 ## 3rd Qu.: 0 ## Max. :99999 ## capital_loss hr_per_week country income ## Min. : 0.0 Min. : 1.00 Length:32561 Length:32561 ## 1st Qu.: 0.0 1st Qu.:40.00 Class :character Class :character ## Median : 0.0 Median :40.00 Mode :character Mode :character ## Mean : 87.3 Mean :40.44 ## 3rd Qu.: 0.0 3rd Qu.:45.00 ## Max. :4356.0 Max. :99.00

Процесс очистки данных «столбец type_employe»

table(a$type_employer) ## ## ? Federal-gov Local-gov Never-worked ## 1836 960 2093 7 ## Private Self-emp-inc Self-emp-not-inc State-gov ## 22696 1116 2541 1298 ## Without-pay ## 14 unemp <- function(x) { x <- as.character(x) if(x==’Never-worked’|x==’Without-pay’) { return(‘Unemployed’) }else{ return(x) } } a$type_employer<-sapply(a$type_employer,unemp) table(a$type_employer) ## ## ? Federal-gov Local-gov Private ## 1836 960 2093 22696 ## Self-emp-inc Self-emp-not-inc State-gov Unemployed ## 1116 2541 1298 21 group_emp <- function(job){ if (job==’Local-gov’ | job==’State-gov’){ return(‘SL-gov’) }else if (job==’Self-emp-inc’ | job==’Self-emp-not-inc’){ return(‘self-emp’) }else{ return(job) } } a$type_employer<-sapply(a$type_employer,group_emp) table(a$type_employer) ## ## ? Federal-gov Private self-emp SL-gov Unemployed ## 1836 960 22696 3657 3391 21

Процесс очистки данных «Военная колонна»

table(a$marital) ## ## Divorced Married-AF-spouse Married-civ-spouse ## 4443 23 14976 ## Married-spouse-absent Never-married Separated ## 418 10683 1025 ## Widowed ## 993 group_marital <- function(mar){ mar <- as.character(mar) # Not-Married if (mar==’Separated’ | mar==’Divorced’ | mar==’Widowed’){ return(‘Not-Married’) # Never-Married }else if(mar==’Never-married’){ return(mar) #Married }else{ return(‘Married’) } } a$marital<-sapply(a$marital,group_marital) table(a$marital) ## ## Married Never-married Not-Married ## 15417 10683 6461

Процесс очистки даты «колонка страны»

table(a$country) ## ## ? Cambodia ## 583 19 ## Canada China ## 121 75 ## Columbia Cuba ## 59 95 ## Dominican-Republic Ecuador ## 70 28 ## El-Salvador England ## 106 90 ## France Germany ## 29 137 ## Greece Guatemala ## 29 64 ## Haiti Holand-Netherlands ## 44 1 ## Honduras Hong ## 13 20 ## Hungary India ## 13 100 ## Iran Ireland ## 43 24 ## Italy Jamaica ## 73 81 ## Japan Laos ## 62 18 ## Mexico Nicaragua ## 643 34 ## Outlying-US(Guam-USVI-etc) Peru ## 14 31 ## Philippines Poland ## 198 60 ## Portugal Puerto-Rico ## 37 114 ## Scotland South ## 12 80 ## Taiwan Thailand ## 51 18 ## Trinadad&Tobago United-States ## 19 29170 ## Vietnam Yugoslavia ## 67 16 levels(a$country) ## NULL asia <- c(‘China’,’Hong’,’India’,’Iran’,’Cambodia’,’Japan’, ‘Laos’ , ‘Philippines’ ,’Vietnam’ ,’Taiwan’, ‘Thailand’) north.America <- c(‘Canada’,’United-States’,’Puerto-Rico’ ) europe <- c(‘England’ ,’France’, ‘Germany’ ,’Greece’,’Holand-Netherlands’,’Hungary’, ‘Ireland’,’Italy’,’Poland’,’Portugal’,’Scotland’,’Yugoslavia’) latin.and.South.America <- c(‘Columbia’,’Cuba’,’Dominican-Republic’,’Ecuador’, ‘El-Salvador’,’Guatemala’,’Haiti’,’Honduras’, ‘Mexico’,’Nicaragua’,’Outlying-US(Guam-USVI-etc)’,’Peru’, ‘Jamaica’,’Trinadad&Tobago’) other <- c(‘South’) group_country <- function(cry){ if (cry %in% asia){ return(‘Asia’) }else if (cry %in% north.America){ return(‘North.America’) }else if (cry %in% europe){ return(‘Europe’) }else if (cry %in% latin.and.South.America){ return(‘Latin.and.South.America’) }else{ return(‘Other’) } } a$country <- sapply(a$country,group_country) table(a$country) ## ## Asia Europe Latin.and.South.America ## 671 521 1301 ## North.America Other ## 29405 663 str(a) ## ‘data.frame’: 32561 obs. of 16 variables: ## $ X : int 1 2 3 4 5 6 7 8 9 10 … ## $ age : int 39 50 38 53 28 37 49 52 31 42 … ## $ type_employer: chr «SL-gov» «self-emp» «Private» «Private» … ## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 … ## $ education : chr «Bachelors» «Bachelors» «HS-grad» «11th» … ## $ education_num: int 13 13 9 7 13 14 5 9 14 13 … ## $ marital : chr «Never-married» «Married» «Not-Married» «Married» … ## $ occupation : chr «Adm-clerical» «Exec-managerial» «Handlers-cleaners» «Handlers-cleaners» … ## $ relationship : chr «Not-in-family» «Husband» «Not-in-family» «Husband» … ## $ race : chr «White» «White» «White» «Black» … ## $ sex : chr «Male» «Male» «Male» «Male» … ## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 … ## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 … ## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 … ## $ country : chr «North.America» «North.America» «North.America» «North.America» … ## $ income : chr «<=50K» «<=50K» «<=50K» «<=50K» … a$type_employer <- sapply(a$type_employer,factor) a$country <- sapply(a$country,factor) a$marital <- sapply(a$marital,factor) a$income <- sapply(a$income,factor) str(a) ## ‘data.frame’: 32561 obs. of 16 variables: ## $ X : int 1 2 3 4 5 6 7 8 9 10 … ## $ age : int 39 50 38 53 28 37 49 52 31 42 … ## $ type_employer: Factor w/ 6 levels «SL-gov»,»self-emp»,..: 1 2 3 3 3 3 3 2 3 3 … ## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 … ## $ education : chr «Bachelors» «Bachelors» «HS-grad» «11th» … ## $ education_num: int 13 13 9 7 13 14 5 9 14 13 … ## $ marital : Factor w/ 3 levels «Never-married»,..: 1 2 3 2 2 2 2 2 1 2 … ## $ occupation : chr «Adm-clerical» «Exec-managerial» «Handlers-cleaners» «Handlers-cleaners» … ## $ relationship : chr «Not-in-family» «Husband» «Not-in-family» «Husband» … ## $ race : chr «White» «White» «White» «Black» … ## $ sex : chr «Male» «Male» «Male» «Male» … ## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 … ## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 … ## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 … ## $ country : Factor w/ 5 levels «North.America»,..: 1 1 1 1 2 1 2 1 1 1 … ## $ income : Factor w/ 2 levels «<=50K»,»>50K»: 1 1 1 1 1 1 1 2 2 2 …

Найдите недостающие данные с помощью (пакет Amelia)

library(Amelia) ## Loading required package: Rcpp ## ## ## ## Amelia II: Multiple Imputation ## ## (Version 1.7.6, built: 2019-11-24) ## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell ## ## Refer to https://gking.harvard.edu/amelia/ for more information ## ## a[a==’?’] <- NA table(a$type_employer) ## ## SL-gov self-emp Private Federal-gov ? Unemployed ## 3391 3657 22696 960 0 21 missmap(a,y.at=c(1),y.labels = c(»),col=c(‘yellow’,’black’))

Исследовательский анализ данных

str(a) ## ‘data.frame’: 32561 obs. of 16 variables: ## $ X : int 1 2 3 4 5 6 7 8 9 10 … ## $ age : int 39 50 38 53 28 37 49 52 31 42 … ## $ type_employer: Factor w/ 6 levels «SL-gov»,»self-emp»,..: 1 2 3 3 3 3 3 2 3 3 … ## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 … ## $ education : chr «Bachelors» «Bachelors» «HS-grad» «11th» … ## $ education_num: int 13 13 9 7 13 14 5 9 14 13 … ## $ marital : Factor w/ 3 levels «Never-married»,..: 1 2 3 2 2 2 2 2 1 2 … ## $ occupation : chr «Adm-clerical» «Exec-managerial» «Handlers-cleaners» «Handlers-cleaners» … ## $ relationship : chr «Not-in-family» «Husband» «Not-in-family» «Husband» … ## $ race : chr «White» «White» «White» «Black» … ## $ sex : chr «Male» «Male» «Male» «Male» … ## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 … ## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 … ## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 … ## $ country : Factor w/ 5 levels «North.America»,..: 1 1 1 1 2 1 2 1 1 1 … ## $ income : Factor w/ 2 levels «<=50K»,»>50K»: 1 1 1 1 1 1 1 2 2 2 … library(ggplot2) library(dplyr) ## ## Attaching package: ‘dplyr’ ## The following objects are masked from ‘package:stats’: ## ## filter, lag ## The following objects are masked from ‘package:base’: ## ## intersect, setdiff, setequal, union ggplot(a,aes(age))+geom_histogram(aes(fill=income),color=’black’,binwidth=1)+theme_bw() ggplot(a,aes(hr_per_week))+geom_histogram()+theme_bw() ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. names(a)[names(a)==’country’] <-‘region’ str(a) ## ‘data.frame’: 32561 obs. of 16 variables: ## $ X : int 1 2 3 4 5 6 7 8 9 10 … ## $ age : int 39 50 38 53 28 37 49 52 31 42 … ## $ type_employer: Factor w/ 6 levels «SL-gov»,»self-emp»,..: 1 2 3 3 3 3 3 2 3 3 … ## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 … ## $ education : chr «Bachelors» «Bachelors» «HS-grad» «11th» … ## $ education_num: int 13 13 9 7 13 14 5 9 14 13 … ## $ marital : Factor w/ 3 levels «Never-married»,..: 1 2 3 2 2 2 2 2 1 2 … ## $ occupation : chr «Adm-clerical» «Exec-managerial» «Handlers-cleaners» «Handlers-cleaners» … ## $ relationship : chr «Not-in-family» «Husband» «Not-in-family» «Husband» … ## $ race : chr «White» «White» «White» «Black» … ## $ sex : chr «Male» «Male» «Male» «Male» … ## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 … ## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 … ## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 … ## $ region : Factor w/ 5 levels «North.America»,..: 1 1 1 1 2 1 2 1 1 1 … ## $ income : Factor w/ 2 levels «<=50K»,»>50K»: 1 1 1 1 1 1 1 2 2 2 … ggplot(a,aes(region))+geom_bar(aes(fill=income),color=’black’)+theme_bw() theme(axis.text.x = element_text(angle = 90, hjust = 1)) ## List of 1 ## $ axis.text.x:List of 11 ## ..$ family : NULL ## ..$ face : NULL ## ..$ colour : NULL ## ..$ size : NULL ## ..$ hjust : num 1 ## ..$ vjust : NULL ## ..$ angle : num 90 ## ..$ lineheight : NULL ## ..$ margin : NULL ## ..$ debug : NULL ## ..$ inherit.blank: logi FALSE ## ..- attr(*, «class»)= chr [1:2] «element_text» «element» ## — attr(*, «class»)= chr [1:2] «theme» «gg» ## — attr(*, «complete»)= logi FALSE ## — attr(*, «validate»)= logi TRUE head(a) ## X age type_employer fnlwgt education education_num marital ## 1 1 39 SL-gov 77516 Bachelors 13 Never-married ## 2 2 50 self-emp 83311 Bachelors 13 Married ## 3 3 38 Private 215646 HS-grad 9 Not-Married ## 4 4 53 Private 234721 11th 7 Married ## 5 5 28 Private 338409 Bachelors 13 Married ## 6 6 37 Private 284582 Masters 14 Married ## occupation relationship race sex capital_gain capital_loss ## 1 Adm-clerical Not-in-family White Male 2174 0 ## 2 Exec-managerial Husband White Male 0 0 ## 3 Handlers-cleaners Not-in-family White Male 0 0 ## 4 Handlers-cleaners Husband Black Male 0 0 ## 5 Prof-specialty Wife Black Female 0 0 ## 6 Exec-managerial Wife White Female 0 0 ## hr_per_week region income ## 1 40 North.America <=50K ## 2 13 North.America <=50K ## 3 40 North.America <=50K ## 4 40 North.America <=50K ## 5 40 Latin.and.South.America <=50K ## 6 40 North.America <=50K

Тренируйтесь и тестируйте

library(caTools) set.seed(101) sample <- sample.split(a$income,SplitRatio=0.7) train <-subset(a,sample == T) test <-subset(a,sample == F)

Развертывание модели

model <- glm(income ~. ,family=binomial(logit),data = train) ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred summary(model) ## ## Call: ## glm(formula = income ~ ., family = binomial(logit), data = train) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -3.4277 -0.5145 -0.1894 0.0000 3.8037 ## ## Coefficients: (1 not defined because of singularities) ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -7.533e+00 4.351e-01 -17.311 < 2e-16 *** ## X 3.063e-06 2.270e-06 1.349 0.177248 ## age 2.696e-02 2.029e-03 13.287 < 2e-16 *** ## type_employerself-emp -1.110e-01 9.024e-02 -1.231 0.218492 ## type_employerPrivate 2.198e-01 7.284e-02 3.017 0.002554 ** ## type_employerFederal-gov 7.061e-01 1.246e-01 5.669 1.44e-08 *** ## type_employerUnemployed -1.452e+01 6.500e+02 -0.022 0.982177 ## fnlwgt 5.958e-07 2.062e-07 2.889 0.003868 ** ## education11th -1.237e-01 2.496e-01 -0.496 0.620056 ## education12th 2.807e-01 3.211e-01 0.874 0.382009 ## education1st-4th -8.216e-01 6.050e-01 -1.358 0.174468 ## education5th-6th -1.079e+00 4.778e-01 -2.259 0.023883 * ## education7th-8th -9.080e-01 2.924e-01 -3.106 0.001899 ** ## education9th -2.828e-01 3.020e-01 -0.936 0.349026 ## educationAssoc-acdm 1.094e+00 2.082e-01 5.253 1.50e-07 *** ## educationAssoc-voc 1.157e+00 2.005e-01 5.771 7.87e-09 *** ## educationBachelors 1.764e+00 1.856e-01 9.503 < 2e-16 *** ## educationDoctorate 2.902e+00 2.573e-01 11.279 < 2e-16 *** ## educationHS-grad 6.261e-01 1.800e-01 3.479 0.000503 *** ## educationMasters 2.061e+00 1.985e-01 10.384 < 2e-16 *** ## educationPreschool -2.037e+01 2.719e+02 -0.075 0.940295 ## educationProf-school 2.549e+00 2.395e-01 10.644 < 2e-16 *** ## educationSome-college 9.107e-01 1.831e-01 4.974 6.55e-07 *** ## education_num NA NA NA NA ## maritalMarried 1.413e+00 1.990e-01 7.103 1.22e-12 *** ## maritalNot-Married 4.855e-01 1.013e-01 4.795 1.63e-06 *** ## occupationArmed-Forces -5.780e-01 1.915e+00 -0.302 0.762760 ## occupationCraft-repair 7.650e-02 9.605e-02 0.797 0.425740 ## occupationExec-managerial 7.988e-01 9.273e-02 8.614 < 2e-16 *** ## occupationFarming-fishing -1.284e+00 1.740e-01 -7.382 1.56e-13 *** ## occupationHandlers-cleaners -6.400e-01 1.692e-01 -3.782 0.000155 *** ## occupationMachine-op-inspct -3.097e-01 1.226e-01 -2.527 0.011500 * ## occupationOther-service -9.062e-01 1.412e-01 -6.417 1.39e-10 *** ## occupationPriv-house-serv -1.345e+01 1.972e+02 -0.068 0.945626 ## occupationProf-specialty 4.525e-01 9.833e-02 4.601 4.20e-06 *** ## occupationProtective-serv 5.354e-01 1.496e-01 3.580 0.000344 *** ## occupationSales 2.491e-01 9.942e-02 2.505 0.012237 * ## occupationTech-support 6.851e-01 1.299e-01 5.276 1.32e-07 *** ## occupationTransport-moving -1.732e-01 1.191e-01 -1.454 0.145924 ## relationshipNot-in-family -7.552e-01 1.958e-01 -3.856 0.000115 *** ## relationshipOther-relative -1.132e+00 2.677e-01 -4.229 2.35e-05 *** ## relationshipOwn-child -1.711e+00 2.392e-01 -7.155 8.38e-13 *** ## relationshipUnmarried -8.609e-01 2.187e-01 -3.936 8.29e-05 *** ## relationshipWife 1.423e+00 1.253e-01 11.357 < 2e-16 *** ## raceAsian-Pac-Islander 6.837e-01 3.356e-01 2.037 0.041657 * ## raceBlack 5.544e-01 2.971e-01 1.866 0.062064 . ## raceOther -1.763e-01 4.457e-01 -0.395 0.692482 ## raceWhite 6.905e-01 2.847e-01 2.426 0.015282 * ## sexMale 9.287e-01 9.579e-02 9.695 < 2e-16 *** ## capital_gain 3.326e-04 1.289e-05 25.807 < 2e-16 *** ## capital_loss 6.587e-04 4.586e-05 14.365 < 2e-16 *** ## hr_per_week 3.157e-02 2.006e-03 15.742 < 2e-16 *** ## regionLatin.and.South.America -5.214e-01 1.595e-01 -3.269 0.001079 ** ## regionAsia -2.803e-01 2.139e-01 -1.310 0.190084 ## regionOther -4.765e-01 1.635e-01 -2.913 0.003574 ** ## regionEurope 6.083e-02 1.524e-01 0.399 0.689829 ## — ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ‘ 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 24139 on 21500 degrees of freedom ## Residual deviance: 13865 on 21446 degrees of freedom ## (1292 observations deleted due to missingness) ## AIC: 13975 ## ## Number of Fisher Scoring iterations: 15 new.step.model <- step(model) ## Start: AIC=13975.19 ## income ~ X + age + type_employer + fnlwgt + education + education_num + ## marital + occupation + relationship + race + sex + capital_gain + ## capital_loss + hr_per_week + region ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## ## Step: AIC=13975.19 ## income ~ X + age + type_employer + fnlwgt + education + marital + ## occupation + relationship + race + sex + capital_gain + capital_loss + ## hr_per_week + region ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Df Deviance AIC ## — X 1 13867 13975 ## <none> 13865 13975 ## — fnlwgt 1 13874 13982 ## — race 4 13881 13983 ## — region 4 13885 13987 ## — marital 2 13921 14027 ## — type_employer 4 13927 14029 ## — sex 1 13963 14071 ## — age 1 14044 14152 ## — capital_loss 1 14079 14187 ## — relationship 5 14114 14214 ## — hr_per_week 1 14121 14229 ## — occupation 13 14330 14414 ## — education 15 14596 14676 ## — capital_gain 1 15166 15274 ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## ## Step: AIC=13975.01 ## income ~ age + type_employer + fnlwgt + education + marital + ## occupation + relationship + race + sex + capital_gain + capital_loss + ## hr_per_week + region ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred ## Df Deviance AIC ## <none> 13867 13975 ## — fnlwgt 1 13875 13981 ## — race 4 13883 13983 ## — region 4 13887 13987 ## — marital 2 13923 14027 ## — type_employer 4 13929 14029 ## — sex 1 13964 14070 ## — age 1 14045 14151 ## — capital_loss 1 14081 14187 ## — relationship 5 14115 14213 ## — hr_per_week 1 14123 14229 ## — occupation 13 14332 14414 ## — education 15 14598 14676 ## — capital_gain 1 15168 15274 summary(new.step.model) ## ## Call: ## glm(formula = income ~ age + type_employer + fnlwgt + education + ## marital + occupation + relationship + race + sex + capital_gain + ## capital_loss + hr_per_week + region, family = binomial(logit), ## data = train) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -3.4361 -0.5142 -0.1895 0.0000 3.8162 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -7.482e+00 4.335e-01 -17.258 < 2e-16 *** ## age 2.696e-02 2.029e-03 13.284 < 2e-16 *** ## type_employerself-emp -1.129e-01 9.021e-02 -1.251 0.210831 ## type_employerPrivate 2.194e-01 7.282e-02 3.013 0.002588 ** ## type_employerFederal-gov 7.056e-01 1.246e-01 5.665 1.47e-08 *** ## type_employerUnemployed -1.449e+01 6.515e+02 -0.022 0.982253 ## fnlwgt 5.938e-07 2.062e-07 2.880 0.003982 ** ## education11th -1.227e-01 2.497e-01 -0.492 0.623038 ## education12th 2.850e-01 3.210e-01 0.888 0.374646 ## education1st-4th -8.209e-01 6.049e-01 -1.357 0.174767 ## education5th-6th -1.079e+00 4.781e-01 -2.256 0.024066 * ## education7th-8th -9.036e-01 2.923e-01 -3.091 0.001993 ** ## education9th -2.847e-01 3.021e-01 -0.942 0.346095 ## educationAssoc-acdm 1.096e+00 2.082e-01 5.262 1.42e-07 *** ## educationAssoc-voc 1.160e+00 2.005e-01 5.785 7.26e-09 *** ## educationBachelors 1.765e+00 1.856e-01 9.509 < 2e-16 *** ## educationDoctorate 2.903e+00 2.572e-01 11.286 < 2e-16 *** ## educationHS-grad 6.274e-01 1.800e-01 3.486 0.000491 *** ## educationMasters 2.064e+00 1.985e-01 10.394 < 2e-16 *** ## educationPreschool -2.039e+01 2.712e+02 -0.075 0.940079 ## educationProf-school 2.552e+00 2.395e-01 10.657 < 2e-16 *** ## educationSome-college 9.129e-01 1.831e-01 4.985 6.19e-07 *** ## maritalMarried 1.414e+00 1.990e-01 7.106 1.20e-12 *** ## maritalNot-Married 4.867e-01 1.013e-01 4.807 1.53e-06 *** ## occupationArmed-Forces -5.706e-01 1.898e+00 -0.301 0.763750 ## occupationCraft-repair 7.865e-02 9.602e-02 0.819 0.412730 ## occupationExec-managerial 8.006e-01 9.271e-02 8.635 < 2e-16 *** ## occupationFarming-fishing -1.282e+00 1.739e-01 -7.373 1.67e-13 *** ## occupationHandlers-cleaners -6.382e-01 1.692e-01 -3.771 0.000162 *** ## occupationMachine-op-inspct -3.097e-01 1.226e-01 -2.526 0.011526 * ## occupationOther-service -9.054e-01 1.412e-01 -6.413 1.43e-10 *** ## occupationPriv-house-serv -1.344e+01 1.973e+02 -0.068 0.945699 ## occupationProf-specialty 4.537e-01 9.831e-02 4.615 3.93e-06 *** ## occupationProtective-serv 5.358e-01 1.495e-01 3.583 0.000340 *** ## occupationSales 2.494e-01 9.940e-02 2.509 0.012108 * ## occupationTech-support 6.881e-01 1.299e-01 5.299 1.16e-07 *** ## occupationTransport-moving -1.731e-01 1.191e-01 -1.453 0.146206 ## relationshipNot-in-family -7.562e-01 1.958e-01 -3.862 0.000113 *** ## relationshipOther-relative -1.133e+00 2.679e-01 -4.230 2.33e-05 *** ## relationshipOwn-child -1.713e+00 2.392e-01 -7.159 8.11e-13 *** ## relationshipUnmarried -8.615e-01 2.187e-01 -3.939 8.17e-05 *** ## relationshipWife 1.418e+00 1.252e-01 11.326 < 2e-16 *** ## raceAsian-Pac-Islander 6.837e-01 3.356e-01 2.037 0.041637 * ## raceBlack 5.533e-01 2.972e-01 1.862 0.062594 . ## raceOther -1.757e-01 4.456e-01 -0.394 0.693325 ## raceWhite 6.895e-01 2.847e-01 2.422 0.015454 * ## sexMale 9.261e-01 9.576e-02 9.670 < 2e-16 *** ## capital_gain 3.325e-04 1.288e-05 25.808 < 2e-16 *** ## capital_loss 6.584e-04 4.584e-05 14.364 < 2e-16 *** ## hr_per_week 3.159e-02 2.006e-03 15.749 < 2e-16 *** ## regionLatin.and.South.America -5.190e-01 1.594e-01 -3.256 0.001128 ** ## regionAsia -2.813e-01 2.138e-01 -1.316 0.188178 ## regionOther -4.787e-01 1.635e-01 -2.929 0.003403 ** ## regionEurope 5.966e-02 1.524e-01 0.392 0.695399 ## — ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ‘ 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 24139 on 21500 degrees of freedom ## Residual deviance: 13867 on 21447 degrees of freedom ## (1292 observations deleted due to missingness) ## AIC: 13975 ## ## Number of Fisher Scoring iterations: 15 test$predicted.income = predict(model, newdata=test, type=»response») ## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == : ## prediction from a rank-deficient fit may be misleading table(test$income, test$predicted.income > 0.5) ## ## FALSE TRUE ## <=50K 6405 518 ## >50K 920 1374

точность нашей модели

6405+1374/(6405+518+920+1374) ## [1] 6405.149

отзывать

6405/(6405+518) ## [1] 0.9251769

Прецессия

6405/(6405+920) ## [1] 0.8744027

Точность нашей модели: 6405,149

Отзыв: 0,9251769

точность: 0,8744027

Источник: ledsshop.ru