Model Abstraction • citrus

The CITRUS package contains several different approaches to segmentation, in order to keep the approaches consistent to produce the same outputs, we require a module that extracts the relevant information from the specific approach. This module is called the model abstraction layer, this provides consistency across the different approaches and ensures the same output is produced moving into the next module.
The model abstraction layer varies for each type of approach as it pulls different information from different approaches. The standard input that it takes is the segmentation model object and the input data that is converted into a list object containing the model, hyperparameters, predicted values and the input data.
library(citrus)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

hyperparameters <- list(dependent_variable = 'response',
                       min_segmentation_fraction = 0.05,
                       number_of_segments = 6,
                       print_plot = FALSE,
                       print_safety_check=20)

formatted <- preprocess(citrus::transactional_data,
                       categories = c('country'), 
                       numeric_operation_list = c('min', 'sd'),
                       target = 'desc_chars', 
                       target_agg = 'mean')
#> Calculating target values
validate(formatted, supervised = TRUE, hyperparameters = hyperparameters)
#> [1] TRUE
model <- tree_segment(formatted, hyperparameters)
model <- tree_abstract(model, citrus::preprocessed_data)

str(model)
#> List of 5
#>  $ segment_model        :List of 15
#>   ..$ frame              :'data.frame':  11 obs. of  8 variables:
#>   .. ..$ var       : chr [1:11] "top_country" "<leaf>" "transactionvalue_min" "transactionvalue_sd" ...
#>   .. ..$ n         : int [1:11] 410 21 389 369 348 26 322 76 246 21 ...
#>   .. ..$ wt        : num [1:11] 410 21 389 369 348 26 322 76 246 21 ...
#>   .. ..$ dev       : num [1:11] 3391 342 2855 2704 2569 ...
#>   .. ..$ yval      : num [1:11] 26.6 1 26.8 26.7 26.6 ...
#>   .. ..$ complexity: num [1:11] 0.057 -1 0.0131 0.013 0.0129 ...
#>   .. ..$ ncompete  : int [1:11] 4 0 4 4 4 0 4 0 0 0 ...
#>   .. ..$ nsurrogate: int [1:11] 0 0 1 0 2 0 3 0 0 0 ...
#>   ..$ where              : int [1:410] 9 2 9 8 2 2 2 2 2 2 ...
#>   ..$ call               : language rpart(formula = f, data = df, method = "anova", control = control)
#>   ..$ terms              :Classes 'terms', 'formula'  language response ~ recency + frequency + monetary + transactionvalue_min + transactionvalue_sd +      top_country
#>   .. .. ..- attr(*, "variables")= language list(response, recency, frequency, monetary, transactionvalue_min, transactionvalue_sd,      top_country)
#>   .. .. ..- attr(*, "factors")= int [1:7, 1:6] 0 1 0 0 0 0 0 0 0 1 ...
#>   .. .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. .. ..$ : chr [1:7] "response" "recency" "frequency" "monetary" ...
#>   .. .. .. .. ..$ : chr [1:6] "recency" "frequency" "monetary" "transactionvalue_min" ...
#>   .. .. ..- attr(*, "term.labels")= chr [1:6] "recency" "frequency" "monetary" "transactionvalue_min" ...
#>   .. .. ..- attr(*, "order")= int [1:6] 1 1 1 1 1 1
#>   .. .. ..- attr(*, "intercept")= int 1
#>   .. .. ..- attr(*, "response")= int 1
#>   .. .. ..- attr(*, ".Environment")=<environment: 0x7fe3d0f5cc38> 
#>   .. .. ..- attr(*, "predvars")= language list(response, recency, frequency, monetary, transactionvalue_min, transactionvalue_sd,      top_country)
#>   .. .. ..- attr(*, "dataClasses")= Named chr [1:7] "numeric" "numeric" "numeric" "numeric" ...
#>   .. .. .. ..- attr(*, "names")= chr [1:7] "response" "recency" "frequency" "monetary" ...
#>   ..$ cptable            : num [1:3, 1:5] 0.0571 0.0131 0.0127 0 1 ...
#>   .. ..- attr(*, "dimnames")=List of 2
#>   .. .. ..$ : chr [1:3] "1" "2" "3"
#>   .. .. ..$ : chr [1:5] "CP" "nsplit" "rel error" "xerror" ...
#>   ..$ method             : chr "anova"
#>   ..$ parms              : NULL
#>   ..$ control            :List of 9
#>   .. ..$ minsplit      : num 40
#>   .. ..$ minbucket     : num 20
#>   .. ..$ cp            : num -1
#>   .. ..$ maxcompete    : int 4
#>   .. ..$ maxsurrogate  : int 5
#>   .. ..$ usesurrogate  : int 2
#>   .. ..$ surrogatestyle: int 0
#>   .. ..$ maxdepth      : int 30
#>   .. ..$ xval          : int 10
#>   ..$ functions          :List of 2
#>   .. ..$ summary:function (yval, dev, wt, ylevel, digits)  
#>   .. ..$ text   :function (yval, dev, wt, ylevel, digits, n, use.n)  
#>   ..$ numresp            : int 1
#>   ..$ splits             : num [1:31, 1:5] 410 410 410 378 410 389 389 389 359 389 ...
#>   .. ..- attr(*, "dimnames")=List of 2
#>   .. .. ..$ : chr [1:31] "top_country" "transactionvalue_min" "monetary" "transactionvalue_sd" ...
#>   .. .. ..$ : chr [1:5] "count" "ncat" "improve" "index" ...
#>   ..$ csplit             : int [1:2, 1:15] 3 3 1 2 3 3 1 2 1 2 ...
#>   ..$ variable.importance: Named num [1:5] 194.4 89.4 41.4 31.2 12.6
#>   .. ..- attr(*, "names")= chr [1:5] "top_country" "monetary" "transactionvalue_min" "transactionvalue_sd" ...
#>   ..$ y                  : Named num [1:410] 26.8 24.8 26.4 26 28 ...
#>   .. ..- attr(*, "names")= chr [1:410] "1" "2" "3" "4" ...
#>   ..$ ordered            : Named logi [1:6] FALSE FALSE FALSE FALSE FALSE FALSE
#>   .. ..- attr(*, "names")= chr [1:6] "recency" "frequency" "monetary" "transactionvalue_min" ...
#>   ..- attr(*, "xlevels")=List of 1
#>   .. ..$ top_country: chr [1:15] "United Kingdom" "France" "Australia" "Netherlands" ...
#>   ..- attr(*, "class")= chr "rpart"
#>  $ model_hyperparameters:List of 4
#>   ..$ segmentation_variables   : chr [1:6] "recency" "frequency" "monetary" "transactionvalue_min" ...
#>   ..$ dependent_variable       : chr "response"
#>   ..$ min_segmentation_fraction: num 0.05
#>   ..$ number_of_segments       : num 6
#>  $ segment_table        :'data.frame':   6 obs. of  8 variables:
#>   ..$ segment             : int [1:6] 1 2 3 4 5 6
#>   ..$ mean_value          : num [1:6] 23.7 28 27.9 25.4 26.2 ...
#>   ..$ percentage          : num [1:6] 5.12 4.88 5.12 6.34 18.54 ...
#>   ..$ n                   : int [1:6] 21 20 21 26 76 246
#>   ..$ top_country         : chr [1:6] "France, Germany, Netherlands, Portugal, Spain, Switzerland" "Australia, Belgium, EIRE, Italy, Japan, Lithuania, Norway, Poland, United Kingdom" "Australia, Belgium, EIRE, Italy, Japan, Lithuania, Norway, Poland, United Kingdom" "Australia, Belgium, EIRE, Italy, Japan, Lithuania, Norway, Poland, United Kingdom" ...
#>   ..$ transactionvalue_min: chr [1:6] "All" "< -20.375" "> -20.375" "> -20.375" ...
#>   ..$ transactionvalue_sd : chr [1:6] "All" "All" "< 2.55673797922722" "> 2.55673797922722" ...
#>   ..$ monetary            : chr [1:6] "All" "All" "All" "< 86.775" ...
#>  $ predicted_values     :'data.frame':   410 obs. of  2 variables:
#>   ..$ id     : chr [1:410] "12395" "12427" "12431" "12433" ...
#>   ..$ segment: int [1:410] 6 1 6 5 1 1 1 1 1 1 ...
#>  $ input_data           : tibble [410 × 8] (S3: tbl_df/tbl/data.frame)
#>   ..$ id                  : chr [1:410] "12395" "12427" "12431" "12433" ...
#>   ..$ recency             : int [1:410] 4 4 6 6 5 2 1 4 2 6 ...
#>   ..$ frequency           : int [1:410] 12 10 14 73 1 84 1 5 63 20 ...
#>   ..$ monetary            : num [1:410] 346 304 358 1919 -17 ...
#>   ..$ transactionvalue_min: num [1:410] 8.5 12.6 15 5.04 -17 ...
#>   ..$ transactionvalue_sd : num [1:410] 14.7 21.6 11.9 19.5 NA ...
#>   ..$ top_country         : chr [1:410] "Belgium" "Germany" "Australia" "Norway" ...
#>   ..$ response            : num [1:410] 26.8 24.8 26.4 26 28 ...
#>  - attr(*, "class")= chr "tree_model"