wensui님의 프로필WENSUI'S BLOG IN STATIST...사진블로그리스트기타 도구 도움말

블로그


    HOW TO CREATE A CROSS TABLE IN R

    ####################################
    # HOW TO CREATE A CROSS TABLE IN R #
    ####################################

    <- cut(1:100, 2, ordered_result = TRUE)
    <- cut(runif(100, 0, 1), 2, ordered_result = TRUE)

    ### WITH TABLE() FUNCTION ###
    table1 <- table(x, y)

    #               y
    # x              (0.000824,0.493] (0.493,0.985]
    #   (0.901,50.5]               20            30
    #   (50.5,100]                 22            28

    t1 <- data.frame(table1)

    #              x                y Freq
    # 1 (0.901,50.5] (0.000824,0.493]   20
    # 2   (50.5,100] (0.000824,0.493]   22
    # 3 (0.901,50.5]    (0.493,0.985]   30
    # 4   (50.5,100]    (0.493,0.985]   28

    ### WITH FTABLE() FUNCTION ###
    table2 <- ftable(x, y)

    #              y (0.000824,0.493] (0.493,0.985]
    # x                                            
    # (0.901,50.5]                 20            30
    # (50.5,100]                   22            28

    t2 <- data.frame(table2)

    #              x                y Freq
    # 1 (0.901,50.5] (0.000824,0.493]   20
    # 2   (50.5,100] (0.000824,0.493]   22
    # 3 (0.901,50.5]    (0.493,0.985]   30
    # 4   (50.5,100]    (0.493,0.985]   28

    ### WITH XTABS() FUNCTION ###
    table3 <- xtabs(~x + y)

    #               y
    # x              (0.000824,0.493] (0.493,0.985]
    #   (0.901,50.5]               20            30
    #   (50.5,100]                 22            28

    t3 <- data.frame(table3)

    #              x                y Freq
    # 1 (0.901,50.5] (0.000824,0.493]   20
    # 2   (50.5,100] (0.000824,0.493]   22
    # 3 (0.901,50.5]    (0.493,0.985]   30
    # 4   (50.5,100]    (0.493,0.985]   28

    EXTRACT ROWS FROM A DATA FRAME IN R

    > ##################################################
    > #        EXTRACT ROWS FROM A DATA FRAME          #
    > ##################################################
    >
    > data <- data.frame(scan(what = list(x = 0, y = 0)))
    1: 1 1
    2: 1 2
    3: 1 3
    4: 2 1
    5: 2 2
    6: 2 3
    7:
    Read 6 records
    >
    > # EXTRACT ROWS WITH LOGICAL SUBSCRIPTS
    > rows1 <- data[data$x == 2 & data$y >= 2,]
    > rows1
      x y
    5 2 2
    6 2 3
    >
    > # EXTRACT ROWS WITH ROW INDEX
    > rows2 <- data[which(data$x == 2 & data$y >= 2),]
    > rows2
      x y
    5 2 2
    6 2 3
    >
    > # EXTRACT ROWS WITH SUBSET() FUNCTION
    > rows3 <- subset(data, x == 2 & y >= 2)
    > rows3
      x y
    5 2 2
    6 2 3
    >
    > # EXTRACT ROWS WITH SQLDF() FUNCTION
    > require(sqldf)
    > rows4 <- sqldf("select * from data where x = 2 and y >= 2", row.names = TRUE)
    > rows4
      x y
    5 2 2
    6 2 3

    A DEMO OF VECTOR AUTOREGRESSIVE FORECASTING MODEL

    #################################################
    # A STANDARD VECTOR AUTOREGRESSIVE (VAR) MODEL  #
    # TO FORECAST 3-MONTH TBILLS, 10-YEAR TBONDS,   #
    # AND STOCK-WATSON EXPERIMENTAL RECESSION INDEX #
    #################################################

    # LOAD LIBRARIES
    library(vars)
    library(fEcofin)

    # CONVERT DATA FRAME INTO TS OBJECT
    data(recession)
    ts <- ts(recession[, 3:5], start = c(1966, 1), frequency = 12)

    # PLOT SERIES
    plot(ts, main = "US Recession Data")

    # SELECT THE BEST LAG
    select <- VARselect(ts, lag.max = 12, type = "both")
    # $selection
    # AIC(n)  HQ(n)  SC(n) FPE(n)
    #    10      3      2     10

    # ESTIMATE VAR MODEL
    var <- VAR(ts, p = 2, type = "both")
    # Estimated coefficients for equation tbills3m:
    # =============================================
    # tbills3m = tbills3m.l1 + tbonds10y.l1 + xri.l1 + tbills3m.l2 + tbonds10y.l2 + xri.l2 + const + trend
    #  tbills3m.l1  tbonds10y.l1        xri.l1   tbills3m.l2  tbonds10y.l2
    # 1.1212021873  0.3505955923  1.2871875302 -0.1731594979 -0.3124729186
    #       xri.l2         const         trend
    # -1.4660556670  0.1313513828 -0.0003302518

    # Estimated coefficients for equation tbonds10y:
    # ==============================================
    # tbonds10y = tbills3m.l1 + tbonds10y.l1 + xri.l1 + tbills3m.l2 + tbonds10y.l2 + xri.l2 + const + trend
    #
    #  tbills3m.l1  tbonds10y.l1        xri.l1   tbills3m.l2  tbonds10y.l2
    # -0.0562701733  1.3349464685  0.3114932693  0.0766959847 -0.3645448725
    #       xri.l2         const         trend
    # -0.3768012352  0.1387545242 -0.0001249355

    # Estimated coefficients for equation xri:
    # ========================================
    # xri = tbills3m.l1 + tbonds10y.l1 + xri.l1 + tbills3m.l2 + tbonds10y.l2 + xri.l2 + const + trend
    #  tbills3m.l1  tbonds10y.l1        xri.l1   tbills3m.l2  tbonds10y.l2
    # 1.421629e-02 -9.035927e-04  1.078047e+00  7.673239e-03 -1.496342e-02
    #       xri.l2         const         trend
    # -2.589415e-01  2.379950e-02 -3.815981e-05

    plot(var)

    ESTIMATE REGRESSION WITH OUTLIERS

    *** SIMULATE DATA WITH OUTLIERS ***;
    data one;
      do i = 1 to 1000;
        x1 = ranuni(1);
        x2 = ranuni(2);
        if i > 900 then y = 50 + rannor(1);
        else y = 10 + 2 * x1 + 4 * x2 + rannor(1) * 0.5;
        output;
      end;
    run;

    *** ROBUST REGRESSION WITH M-ESTIMATION ***;
    proc robustreg data = one method = m;
      model y = x1 x2;
    run;
    /*
                          Standard   95% Confidence     Chi-
    Parameter DF Estimate    Error       Limits       Square Pr > ChiSq
    Intercept  1   9.8867   0.0463   9.7959   9.9776 45506.8     <.0001
    x1         1   2.1440   0.0591   2.0282   2.2597 1318.17     <.0001
    x2         1   4.0620   0.0606   3.9433   4.1806 4500.31     <.0001
    Scale      1   0.5850                                              
    */
        
    *** GENERALIZED MAXIMUM ENTROPY ***;
    proc entropy data = one gmenm;
      model y = x1 x2;
    run;  
    /*
                                  Approx                  Approx
    Variable        Estimate     Std Err    t Value     Pr > |t|
    x1               0.30864     0.00877      35.20       <.0001
    x2              1.490858      0.0397      37.58       <.0001
    Intercept       15.02985      0.3087      48.69       <.0001
    */
        
    *** RIDGE REGRESSION ***;
    proc reg data = one ridge = 1 to 4 by 0.1 outest = est;
      model y = x1 x2 / noprint;
    run;

    proc print data = est;
    run;  
    /*
    _MODEL_  _TYPE_  _DEPVAR_  _RIDGE_  _PCOMIT_   _RMSE_  Intercept     x1       x2     y
    MODEL1   PARMS      y         .         .     11.0916   14.3101   0.60107  4.17812  -1
    MODEL1   RIDGE      y        1.0        .     11.1080   15.5109   0.28663  2.08686  -1
    MODEL1   RIDGE      y        1.1        .     11.1096   15.5677   0.27235  1.98739  -1
    MODEL1   RIDGE      y        1.2        .     11.1111   15.6192   0.25943  1.89697  -1
    MODEL1   RIDGE      y        1.3        .     11.1125   15.6663   0.24767  1.81442  -1
    MODEL1   RIDGE      y        1.4        .     11.1139   15.7094   0.23693  1.73876  -1
    MODEL1   RIDGE      y        1.5        .     11.1152   15.7491   0.22708  1.66915  -1
    MODEL1   RIDGE      y        1.6        .     11.1164   15.7856   0.21802  1.60491  -1
    MODEL1   RIDGE      y        1.7        .     11.1176   15.8195   0.20965  1.54542  -1
    MODEL1   RIDGE      y        1.8        .     11.1187   15.8509   0.20190  1.49019  -1
    ... ...
    */

    FIND BOXCOX TRANSFORMATION OF PREDICTORS WITH QLIM PROCEDURE

    ****************************************************;
    * USE QLIM PROCEDURE TO FIND BOXCOX TRANSFORMATION *;
    * IN A REGRESSION MODEL:                           *;
    *  LAMBDA -> 0, BOXCOX(X) = LOG(X)                 *;
    *                           POWER(X, LAMBDA) - 1   *;
    *  LAMBDA ~= 0, BOXCOX(X) = --------------------   *;
    *                                 LAMBDA           *;
    ****************************************************;

    *** SIMULATE DATA WITH LAMBDA1 = 0 AND LAMBDA2 = 2 ***;
    data one;
      do i = 1 to 5000;
        x1 = ranuni(1);
        x2 = ranuni(2);
        y = 4 + 3 * log(x1) - 2 * ((x2 ** 2 - 1) / 2) + rannor(1) * 0.5;
        output;
      end;
    run;

    *** CHECK FUNCTIONAL FORM BY AGGREGATES OF RESIDUALS ***;
    ods graphics on;
    proc genmod data = one;
      model y = x1 x2;
      assess var = (x1) / resample = 10 seed = 1;
    run;
    ods graphics off;

    /* PARTIAL OUTPUT:
                           Assessment Summary
                   Maximum
    Assessment    Absolute                                      Pr >
    Variable         Value    Replications          Seed    MaxAbsVal
    x1             20.2347              10             1       <.0001
    */

    *** FIND BOXCOX TRANSFORMATIONS OF X1 & X2 ***;
    proc qlim data = one;
      model y = x1 x2 / boxcox(x1 = lambda_x1, x2 = lambda_x2);
    run;  

    /* PARTIAL OUTPUT:
                          Parameter Estimates
                                     Standard                 Approx
    Parameter        Estimate           Error    t Value    Pr > |t|
    Intercept        4.013770        0.026152     153.48      <.0001
    x1               3.014509        0.016633     181.23      <.0001
    x2              -1.973779        0.140882     -14.01      <.0001
    lambda_x1        0.001132        0.002546       0.44      0.6567
    lambda_x2        1.983583        0.125074      15.86      <.0001
    _Sigma           0.501839        0.005018     100.00      <.0001
    */

    data two;
      set one;
      z1 = log(x1);
      z2 = (x2 ** 1.983583 - 1) / 1.983583;
    run;

    *** REGRESSION AGAIN AFTER BOXCOX TRANSFORMATION ***;
    proc reg data = two;
      model y = z1 z2;
    run;

    /* PARTIAL OUTPUT:
                            Parameter Estimates
                         Parameter       Standard
    Variable     DF       Estimate          Error    t Value    Pr > |t|
    Intercept     1        4.01059        0.01887     212.49      <.0001
    z1            1        3.00784        0.00716     420.20      <.0001
    z2            1       -1.97355        0.04756     -41.50      <.0001
    */

    DOWNLOAD EXCHANGE RATES FROM WWW.OANDA.COM & CALCULATE USD INDEX

    #################################################
    # A PIECE OF R SNIPPET TO PULL EXCHANGE RATES   #
    # FROM WWW.OANDA.COM AND TO CALCULATE USD INDEX #
    #################################################

    library(fImport)

    # DOWNLOAD EACH COMPONENT OF USDX FROM WWW.OANDA.COM
    eur.usd <- oandaSeries("EUR/USD")
    usd.jpy <- oandaSeries("USD/JPY")
    gbp.usd <- oandaSeries("GBP/USD")
    usd.cad <- oandaSeries("USD/CAD")
    usd.sek <- oandaSeries("USD/SEK")
    usd.chf <- oandaSeries("USD/CHF")

    plot(cbind(eur.usd, usd.jpy, gbp.usd, usd.cad, usd.sek, usd.chf), 
         main = 'Components of USD Index')


    # CALCULATE USDX BY COMBINING ALL COMPONENTS
    # REFERENCE: http://en.wikipedia.org/wiki/USD_Index

    usdx <- 50.14348112 * (eur.usd ^ (-0.576)) *
            (usd.jpy ^ 0.136) * (gbp.usd ^ (-0.119)) *
            (usd.cad ^ 0.091) * (usd.sek ^ 0.042) *
            (usd.chf ^ 0.036)
    colnames(usdx) <- "USD Index"

    plot(usdx, main = "USD Index")

    Emacs - An All-in-One Intelligent Computing Interface

         Useful Key Bindings for Emacs Python Mode
    C-M-x   : Execute the current function or class
    C-c C-c : Execute the buffer (i.e., the file being displayed)
    C-c C-h : Get context-based help
    C-c C-s : Execute a Python command
    C-c C-t : Toggle shells
    C-c !   : Open the Python interactive shell
    C-c #   : Comment the highlighted (marked) region
    C-c ?   : Show Python mode documentation
    C-c |   : Execute the highlighted (marked) part of the current program
     
         Useful Key Bindings to Run R with ESS/Emacs
    C-c C-r : submit region
    C-c C-l : send whole buffer (file) without echoing the lines
    C-c C-b : send whole buffer with echoing the lines (slower than above)
    C-c C-c : submit "paragraph" (contiguous chunk of code)
    C-c C-f : submit a Function in current buffer
    C-c C-j : submit the current line
    C-c C-v : get help on R(enter name in mini-buffer)
    C-c TAB : complete object/file name
     
     
         Figure 1: Emacs and SAS
     
     
     
         Figure 2: Emacs and R
     
     
         Figure 3: Emacs and Python
     
     
         Figure 4: Emacs and Octve
     
     

    HOW TO DOWNLOAD STOCK PRICE ONLINE AND CREATE A TIME SERIES PLOT WITH R

    #################################################
    # A DEMO ON HOW TO DOWNLOAD STOCK PRICE ONLINE  #
    # AND CREATE A TIME SERIES PLOT WITH R          #
    #################################################

    library(chron)
    library(zoo)

    # STOCK TICKER OF Fifth Third Bancorp
    stock <- 'FITB'

    # DEFINE STARTING DATE
    start.date  <- 1
    start.month <- 1
    start.year  <- 2007

    # DEFINE ENDING DATE
    end.date  <- 31
    end.month <- 12
    end.year  <- 2008

    # DEFINE URL LINK
    link <- paste("http://ichart.finance.yahoo.com/table.csv?s=", stock,
                  "&a=", as.character(start.month - 1), 
                  "&b=", as.character(start.date), 
                  "&c=", as.character(start.year),
                  "&d=", as.character(end.month - 1),
                  "&e=", as.character(end.date), 
                  "&f=", as.character(end.year),
                  "&g=d&ignore=.csv", sep = '')

    # DOWNLOAD STOCK PRICE AS CSV FILE
    download.file(link, "d:/r/data.csv")

    # READ THE CSV FILE INTO R
    data <- read.csv("d:/r/data.csv")

    # CONVERT CHARACTER INTO DATE
    dt <- dates(as.character(data[, 1]), format = "y-m-d")

    # CONVERT DATA FRAME INTO TS OBJECT
    ts <- zoo(data[, 2:5], dt)

    # CREATE A PLOT FOR OPEN/CLOSE/HIGH/LOW PRICES
    plot(ts, main = stock)


    A PYTHON FUNCTION TO READ STOCK PRICE FROM FINANCE.YAHOO.COM

    #############################################
    # READ STOCK PRICE FROM FINANCE.YAHOO.COM   #
    #############################################

    import urllib
    from dateutil.relativedelta import *
    from datetime import *

    def GetPrice(ticker, start, end):
      stock = ticker.upper()

      m1 = int(start.split("/")[0])
      d1 = int(start.split("/")[1])
      y1 = int(start.split("/")[2])
      dt1 = date(y1, m1, d1) + relativedelta(months = -1);
      a = dt1.month
      b = dt1.day
      c = dt1.year

      m2 = int(end.split("/")[0])
      d2 = int(end.split("/")[1])
      y2 = int(end.split("/")[2])
      dt2 = date(y2, m2, d2) + relativedelta(months = -1);
      d = dt2.month
      e = dt2.day
      f = dt2.year

      url = "http://ichart.finance.yahoo.com/table.csv?s=" + stock + \
            "&d=" + str(d) + "&e=" + str(e) + "&f=" + str(f) +       \
            "&g=d&a=" + str(a) + "&b=" + str(b) + "&c=" + str(c) + "&ignore=.csv"
      data = urllib.urlopen(url)
      print data.read()
      
    GetPrice("jpm", "08/01/2009", "08/10/2009")

    '''
    Date,Open,High,Low,Close,Volume,Adj Close
    2009-08-10,42.03,43.22,42.01,42.69,43700600,42.69
    2009-08-07,41.28,43.13,41.18,42.36,65696000,42.36
    2009-08-06,42.28,42.46,40.22,40.75,54175600,40.75
    2009-08-05,40.34,42.20,40.26,41.78,63334100,41.78
    2009-08-04,39.27,40.50,39.17,40.21,43363800,40.21
    2009-08-03,39.12,39.75,38.99,39.60,42957800,39.60
    '''

    SELECTION OF OPTIMAL FREE PARAMETER IN ARTIFICIAL NEURAL NETWORKS

    Abstract

    As nonparametric statistical techniques, Artificial Neural Networks (ANNs) have received much attention in financial modeling recently. Despite their advantages, Artificial Neural Networks suffer from the problem of over-fitting. A Cross Validation method, V-Fold Cross Validation, is proposed in this project to select the optimal value of free parameter and to improve the ability of generalization in ANNs. The performance of this method is assessed using both a simulated data and a real-world Fixed Income data. In our study, V-fold Cross Validation works successfully in two paradigms of ANNs, which are Back propagation Neural Network (BPN) and Generalized Regression Neural Network (GRNN).

     

    1. Introduction

    Using Artificial Neural Networks (ANNs) as an alternative to traditional modeling methodologies to develop forecasting models and conduct function approximation in finance is not new. Since 1990s, numerous papers have been dedicated to this area. Hutchinson, Lo, and Poggio (1994) have successfully applied ANNs to learn the Black-Scholes formula with three different ANNs paradigms and the results are promising despite the needs in proper inference and better performance measurements. Research interests in ANNs are also on the comparison with other statistical methods. Tang and Fishwick (1992) have experimented a broad range of economic time series with different complexities using Back Propagation Neural Networks (BPN) and reported that BPN outperforms the traditional Box-Jenkins method in time series forecasting and is much easier to use. Leung, Chen, Daouk (2000) have applied several different methods, including Back Propagation Neural Networks (BPN) and Generalized Regression Neural Networks (GRNN), to forecasting three exchange rates and found that GRNN outperforms the other methods according to the measurements of RMSE, MAE, and U statistics.

     

    As data-driven nonparametric methods, ANNs share most of the costs and benefits with the others in the family of nonparametric models. Despite the ability to approximate any continuous function with any desired accuracy, ANNs also suffer from the over-fitting problem. Unfortunately, there is no simple method to correct this problem. One of the remedies for the over-fitting problem in ANNs is the proper selection of network parameters. In the study, I should discuss a method for this purpose, V-fold Cross Validation, and apply it to GRNN due to the simple specification of GRNN with only one free parameter. Furthermore, the same method should also be extended to BPN with a more complicated structure of parameters. The rest of this paper is organized as the following. In Section 2, two most popular paradigms of ANNs, BPN and GRNN, and V-fold Cross Validation will be briefly introduced. In Section 3, a simulation analysis will be conducted with the proposed method on GRNN and BPN respectively. In Section 4, both GRNN and BPN with V-fold Cross Validation method should be used to approximate the forward rate of US Treasury Strips.

     

    SAS/IML routine to calculate AIC and BIC of a Logit Model

    ******************************************;
    * HOW TO USE IML PROCEDURE TO CALCULATE  *;
    * THE LIKELIHOOD FUNCTION OF A LOGIT     *;
    * MODEL TOGETHER WITH AIC AND BIC.       *;
    * -------------------------------------- *;
    * WITH THE SAME ROUTINE, AIC AND BIC     *;
    * FROM A SEPARATE HOLDOUT DATASET CAN BE *;
    * CALCULATED.                            *;
    ******************************************;

    *** REMISSION DATA IS FROM EXAMPLES OF ***;
    *** LOGISTIC PROCEDURE IN SAS MANUAL   ***;

    ods output ParameterEstimates = Parms;
    proc logistic data = remission desc;
      model remiss = li;
    run;

    /* STANDARD SAS OUTPUT AS BENCHMARK:
             Model Fit Statistics
                                 Intercept
                  Intercept            and
    Criterion          Only     Covariates

    AIC              36.372         30.073
    SC               37.668         32.665
    -2 Log L         34.372         26.073
    */
        
    proc iml;
      *** READ INPUT DATA INTO MATRIX ***;
      use remission;
      read all var{remiss li} into data;
      close remission;

      *** READ PARAMETERS INTO MATRIX ***;
      use parms;
      read all var{estimate} into beta;
      close parms;
      
      y   = data[, 1];
      x   = J(nrow(y), 1, 1)||data[, 2];
      p   = exp(x * beta) / (1 + exp(x * beta));
      ll  = log(((p ## y) # ((1 - p) ## (1 - y)))[#, ]);
      aic = 2 * nrow(beta) - 2 * ll;
      bic = nrow(beta) * log(nrow(y)) - 2 * ll;
      print (ll||aic||bic)
            [colname = {"Log Likelihood", "AIC", "BIC"}
             format  = 8.3];
    quit;

    /* MANUALLY CALCULATED RESULT:
    Log Likelihood      AIC      BIC

           -13.036   30.073   32.665
    */

    A FUNCTION TO CALCULATE AMORTIZATION SCHEDULE

    #################################################
    # A FUNCTION TO CALCULATE AMORTIZATION SCHEDULE #
    #################################################

    def amortization(loan, terms, rate):
      print "Balance".center(15)        + \
            "Payment".center(15)        + \
            "Principal_Paid".center(15) + \
            "Interest_Paid".center(15)  
      A = (loan * rate) / (1 - pow(1 + rate, -terms))
      for i in xrange(terms):
        P = loan * pow(1 + rate, i) - A * (pow(1 + rate, i) - 1) / rate
        R = P * rate
        print str(round(P, 2)).center(15)     + \
              str(round(A, 2)).center(15)     + \
              str(round(A - R, 2)).center(15) + \
              str(round(R, 2)).center(15)

    amortization(100, 5, 0.1)

    # OUTPUT:
    #    Balance        Payment     Principal_Paid Interest_Paid
    #     100.0          26.38          16.38           10.0    
    #     83.62          26.38          18.02           8.36    
    #     65.60          26.38          19.82           6.56    
    #     45.78          26.38          21.80           4.58    
    #     23.98          26.38          23.98           2.40

    GENERALIZATIONS OF GENERALIZED ADDITIVE MODEL

    ABSTRACT

    Famous for its easy implementation, Logistic Regression has been the primary model in credit scoring. Albeit attractively simple, it is criticized for failing to capture nonlinearity and non-monotonicity and therefore possibly not leads to satisfactory results. Introduced by Hastie and Tibshirani, Generalized Additive Model (GAM) provides the ability to detect the nonlinear and non-monotonic relationship between the risk behavior and predictors without the loss of interpretability. However, the semi-parametric nature of GAM makes it difficult to be used in a business. In this paper, we present an application of GAM in credit scoring as well as a class of hybrid models combining ideas of Logistic Regression and GAM in order to improve the generalizability of nonlinear modeling. Model performance is evaluated and compared among discussed models using the area under Receiver Operating Characteristic (ROC) curve. It is found that our proposed method is able to yield superior results in practice.

     

    INTRODUCTION

    The credit score aims to evaluate the credit worthiness of a potential borrower or the default likelihood of an existing customer with the purpose of minimizing financial costs and losses due to the credit failure. While FICO score from the credit bureau has often been used directly by most small- and mid-sized banking, credit card, and lending companies, major players tend to develop the internal scoring system by their own modeling team. The development of credit scoring models has been extensively studied and is expected to be a continuously active area in the financial industry given the present turmoil of credit crisis. Fallen into the class of Generalized Linear Model (GLM), Logistic Regression is currently the Number One statistical technique in credit scoring models due to its simplicity. In recent researches, Generalized Additive Model (GAM), a more flexible statistical model with the semi-parametric functional form, has shown promising successes in the credit risk modeling by combining the flexibility with the interpretability. However, due to its semi-parametric nature, GAM has been found difficult to be understood in the business environment and be implemented in the production setting. In our paper, we should illustrate the superiority of GAM over Logistic Regression through the development of a credit risk model. Meanwhile, we would also demonstrate how to use other data mining techniques, namely Classification and Regression Tree (CART) and Multivariate Adaptive Regression Splines (MARS), to improve the usability of GAM and to bring this new modeling technique down to earth within the framework of GLM that is more familiar to the majority of our modelers

    ... ...

     

    CONCLUSION

    In this paper, we have demonstrated a new class of nonlinear modeling techniques and its application in credit risk modeling. In our experience, GAM outperforms Logistic Regression in two aspects. First of all, GAM relaxes the linear assumption between the response and predictors and therefore avoids the potential problem of model misspecification, which often occurs in a linear-based model such as Linear Discriminant Analysis or Logistic Regression. Secondly, by incorporating nonlinear effects, GAM helps discover hidden patterns between the response and predictors and consequently improves the predictive performance albeit at the risk of over-fitting. Besides the original concept and implementation of GAM, we have also proposed two hybrid models based upon our learning experience, which are piecewise constant and piece linear approximation models. Our findings show that the hybrid model combining the flexibility of GAM and the resistance to over-fitting of CART is able to yield the best result.

     

    A CLASS OF PREDICTIVE MODELS FOR MULTI-LEVEL RISK

    ABSTRACT

    In the financial service industry, discriminant analysis and its variants based upon binary outcome, such as logistic regression or neural networks, are largely used to develop predictive models. However, the two-state assumption of such models over-simplifies customers’ behavioral outcomes and ignores the existence of multi-level risk. In many situations, the financial impact of a certain customer is directly related to the frequency and the severity of his/her adverse behaviors. Therefore, it is of interest to model and predict such multi-level risks. Several modeling techniques, from Poisson to Ordered Logit models, have been widely discussed in numerous research literatures about how to predict the multi-level risks. Our paper is also an attempt contributed to this end. Several modeling strategies together with their implementations and related scoring scheme will be illustrated. Our purpose is to demonstrate an application of these complex statistical models with the business touch and how to implement them in a production environment.

     

    METHODOLOGY

    In retail banking and credit card industries, it is a key interest to predict the probability of customer’s adverse behaviors, such as delinquencies or defaults. A widely accepted practice in the industry is to classify customers into 2 groups, the good and the bad, based upon the presence of certain adverse behaviors and then to model this binary outcome with discriminant models. For instance, a customer will be classified as the bad if he/she misses payments during a valuation horizon of one year. In SAS community, most efforts contributed to the improvement of these prediction models so far have been focusing on discovering the relationship between the outcome and predictors through either parametric or nonparametric statistical methods. Li (2006) compared discriminant analysis and logistic regression in the credit risk modeling. Liu and Cela (2007) demonstrated how to use Generalized Additive Models to capture the nonlinear relationship in a credit scoring model. However, an obvious limitation of discriminant models based upon the binary outcome is that the two-state classification over-simplifies adverse behaviors of customers. To the best of our knowledge, what financially impact a financial institute are not only the presence of a certain adverse behavior but also the frequency and the severity of such behavior. As a result, it is advantageous to differentiate different levels of the risk and evaluate the probability of each risk level.

     

    ... ...

     

    CONCLUSION

    In this paper, we have reviewed several modeling strategies for count data and their implementations. Basic Poisson models with and without the consideration of observed heterogeneity is a good starting point for count data modeling. For count data with the evidence of over-dispersion, negative binomial regression with a more liberal assumption on variance is able to provide a better solution. If the over-dispersion results from a high frequency of zero counts, advanced composite models such as hurdle regression, zip regression and latent class regression might give more satisfactory fit to the data. An example in credit risk assessment has been used in our paper to demonstrate the usage of various models for count data and related statistical tests. However, successful applications can also be extended to other business problems, such as database marketing, healthcare utilization, and quality control.

      

    SAS Macro to do Monotonic WOE Transformation for Numeric Predictors

    To Modelers and ScoreCard Developers who understand WOE:

    1. What does the macro do?
       A. (guaranteed) monotonic WOE transformation/binning for numeric predictors
       B. calculate KS and Information Value of each Predictor
       C. generate recoding SAS file of WOE transformation
       D. generate recoding SAS files of binning (format and put statement)


    2. How to use this compiled macro?
       A. save this macro to your macro library
       B. use following SAS code to call the macro
       ========================================
       options mstored sasstore = yourlib;
       libname yourlib 'C:\yourpath';
       %woe(data = yourdata, y = y, x = x2 x3 x4 x5);

    3. A peek into this macro:

       %macro woe(data = , y = , x = );
       *********************************************************;
       * THIS MACRO IS DESIGNED TO DO MONOTONIC WEIGHT OF      *;
       * EVIDENCE TRANSFORMATION FOR SCORECARD DEVELOPMENT     *;
       * IN A PRODUCTION ENVIRONMENT.                          *;
       * ----------------------------------------------------- *;
       * INPUT PARAMETERS:                                     *;
       *  DATA: INPUT DATA                                     *;
       *  Y   : BAD INDICATOR WITH 0/1 (BAD ==> 1)             *;
       *  X   : A LIST OF NUMERIC PREDICTORS                   *;    
       * ----------------------------------------------------- *;
       * DEFAULT OUTPUT FILES:                                 *;
       *  .WOE: WOE RECODING FILE TO BE USED IN DATA STEP      *;
       *  .FMT: RECODING FILE TO BE USED IN PROC FORMAT        *;
       *  .PUT: BINNING RECODING FILE WITH PUT STATEMENT       *;
       *  .SUM: VARIABLES LIST RANKED BY KS AND INFO. VALUE    *;
       *  .OUT: OUTPUT FILE WITH WOE BINNING SUMMARY           *;    
       *********************************************************;

       ......

       *********************************************************;
       * END OF THE WHOLE MACRO                                *;
       *********************************************************;
       %mend woe;

    4. One more note:
         Special thanks go to my friend Michael Murff, Manager
       of risk detection at Paypal. This macro was written with
       his encouragement and help.
      
    5. Please feel free to contact me for details:
       Phone: 513-393-9898
       Email: liuwensui@gmail.com


    RealMatrix++: A Real-Valued Matrix Class and Library

    RealMatrix++
    A Real-Valued Matrix Class and Library

    Daniel Hanson
    Email: HoserAnalytics (at) gmail (dot) com
    June 1, 2009

    Introduction and Purpose

    A frequent frustration for quant development in finance is the lack of a matrix class and matrix algebra library in the C++ Standard Library.  Third party libraries do exist, both open source and commercial, but open source libraries are often time consuming to learn and implement, and commercial libraries can carry a heavy price tag.  Furthermore, for quant finance, as in other domains such as predictive modeling, the vast majority of linear algebra calculations are real-valued.

    The intent of the RealMatrix++ Library is to provide an easy-to-use, non-templated matrix class and library for real-valued matrices and vectors, with the ultimate goal of providing functionality on par with that of the JAMA library for Java [1].  The code is not optimized for performance to the extent of, say, the Matrix Template Library [2], or Blitz [3], but it will likely offer more rapid development, and performance sufficient at least for prototyping models.

    Current Release Features and Design

    This initial release contains a real matrix class, and matrix operations +, -, and *, where (*) is overloaded for standard matrix multiplication, vector by matrix and matrix by vector multiplication, and scalar multiplication (see MatrixOperations.h for details).

    There is also a set of validation conditions outlined in MatrixConditions.h, which will check if two matrices are of the same dimensions (necessary for matrix addition and subtraction), if a matrix is square, or if a matrix is symmetric.

    The Matrix class itself is built as a wrapper around the valarray container in the C++ standard library.  As Stroustrup[4] says, "The standard library does not provide a matrix class.  Instead, the intent is for valarray to provide the tools for building matrices optimized for a variety of needs."

    Also, as Josuttis [5] points out, "a valarray can be used as a base both for vector and matrix operations.with good performance."  On the other hand, he notes that "you often need some inconvenient and time-consuming type conversions." This relates primarily to the need to create a separate valarray instance to use the valarray slice function.  For this reason, its use is avoided in the Matrix class here, although it may be used minimally going forward, in cases where the benefit outweighs the inconvenience.

    The current release has been unit tested, but not completely stress tested.  It should work as designed, but at this stage it should probably be best considered a beta release. Your comments are invited (email address given at the top of this document).

    Future Additions/Enhancements

    The following functionality is being considered for future
    releases (not necessarily an exhaustive list):
            Matrix Decompositions
            o    Cholesky Decomposition of square and symmetric matrices
            o    Singular Value Decomposition
            o    LU Decomposition
            Eigenvalue calculations
            Principal component analysis
            Solutions of linear systems
            Matrix inverse

    Licensing

    This software is released under the terms of the BSD License [6]:

    Copyright (c) 2009, Daniel Hanson
    All rights reserved.

    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.  Neither the name of the copyright holder nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT  SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    References

    [1]
    http://math.nist.gov/javanumerics/jama/
    [2] http://www.osl.iu.edu/research/mtl/
    [3] http://www.oonumerics.org/blitz/
    [4] Bjarne Stroustrup, The C++ Programming Language, 3E (Addison Wesley)
    [5] Nicolai Josuttis, The C++ Standard Library: A Tutorial and Reference  (Addison Wesley)
    [6]
    http://www.opensource.org/licenses/bsd-license.php

    A MACRO TO IMPUTE NUMERIC VARIABLES BASED UPON BAD RATES

    libname data 'E:\sas';

    data test;
      set data.credit;
      if ranuni(2) < 0.1 then x2 = .;
      if ranuni(3) < 0.3 then x3 = .;
      if ranuni(4) < 0.2 then x4 = .;
    run;

    proc means data = test n nmiss nonobs;
      class y;
      var x2 - x4;
    run;  

    /*                                       N
         Y    Variable    Label       N    Miss
         0    X2          X2       5217     591
              X3          X3       4011    1797
              X4          X4       4675    1133
         1    X2          X2        343      29
              X3          X3        268     104
              X4          X4        311      61
    */
        
    %macro impute(data = , y = , x = , outfile = );
    *********************************************;
    * THIS MACRO IS TO IMPUTE NUMERIC VARIABLES *;
    * BASED UPON BAD RATES                      *;
    * ----------------------------------------- *;
    * PARAMETERS:                               *;
    *  DATA   : INPUT DATA SET                  *;
    *  Y      : RESPONSE WITH 0 / 1             *;
    *  X      : A LIST OF NUMERIC PREDICTORS    *;
    *           WITH MISSING VALUES             *;    
    *  OUTFILE: FILEREF FOR IMPUTING RECODE     *;
    *********************************************;
        
    %if %sysfunc(fexist(&outfile)) %then %do;
      data _null_;
        rc = fdelete("&outfile");
        if rc = 0 then put "THE FILE HAS BEEN DELETE.";
      run;
    %end;

    ods output position = _pos1;
    ods listing close;
    proc contents data = &data varnum;
    run;
    ods listing;

    proc sql noprint;
      select
        upcase(variable) into :x2 separated by ' '
      from
        _pos1
      where
        compress(upcase(type), ' ') = 'NUM' and
        index("%upcase(%sysfunc(compbl(&x)))", compress(upcase(variable), ' ')) > 0;

      select
        count(variable) into :xcnt
      from
        _pos1
      where
        compress(upcase(type), ' ') = 'NUM' and
        index("%upcase(%sysfunc(compbl(&x)))", compress(upcase(variable), ' ')) > 0;    
    quit;

    data _tmp1;
      retain &x2 &y;
      set &data;
      where &y in (0, 1);
      keep &x2 &y;
    run;  

    ods output position = _pos2;
    ods listing close;
    proc contents data = _tmp1 varnum;
    run;
    ods listing;

    %let loop = 0;

    %do i = 1 %to &xcnt;
      proc sql noprint;
        select
          upcase(variable) into :var
        from
          _pos2
        where
          num = &i;

        select
          count(distinct &var) into :vcnt
        from
          _tmp1
        where
          &var ~= . and
          &y in (1, 0);

        select
          nmiss(&var) / count(*) * 100 into :mpct
        from
          _tmp1
        where
          &y in (1, 0);

        select
          count(distinct &y) into :yflg
        from
          _tmp1
        where
          &y in (1, 0);
      quit;
      
      %if &vcnt > 1 & &mpct > 0 & &mpct < 99 & &yflg = 2 %then %do;        
        %let loop = %eval(&loop + 1);
        
        proc sql noprint;
          select
            mean(&y) format = 10.6 into :mis_rate
          from
            _tmp1
          where
            &var = .     and
            &y in (0, 1);

          select
            count(*) into :mcnt
          from
            _tmp1
          where
            &var = .     and
            &y in (1, 0);

          create table
            _tmp2 as
          select distinct
            "%upcase(%trim(&var))"    as variable,
            &var                      as value,
            mean(&y)                  as bad_rate,
            &mis_rate                 as mis_rate,
            count(*)                  as freq,
            &mcnt                     as mis_cnt,
            abs(mean(&y) - &mis_rate) as dif
          from
            _tmp1
          where
            &var ~= .    and
            &y in (0, 1)
          group by
            &var;
        quit;

        proc sort data = _tmp2 sortsize = max;
          by dif descending freq;
        run;

        data _tmp3;
          set _tmp2;
          by dif descending freq;
          if _n_ = 1 then output;
        run;

        %if &loop = 1 %then %do;
          data _result;
            set _tmp3;
          run;
        %end;
        %else %then %do;
          data _result;
            set _result _tmp3;
          run;
        %end;    
      %end;        
    %end;    

    data _null_;
      set _result end = eof;
      file impute mod;
      if _n_ = 1 then do;
        put @1 93 * "*" ";";
        put @1 "* IMPUTED VARIABLE" @32 "| IMPUTED VALUE" @52 "| BAD RATE"
            @65 "| MIS. COUNT" @80 "| MIS. RATE" @93 "*;";
      end;
      put @1 "*" 91 * "-" @93 "*;";
      put @1 "*"  @3 variable @32 "|" @34 value @52 "|" @54 bad_rate percent8.2
          @65 "|" @67 mis_cnt @80 "|" @82 mis_rate percent8.2 @93 "*;";
      if eof then do;
        put @1 93 * "*" ";";
      end;  
    run;

    data _null_;
      set _result;
      file impute mod;
      if _n_ = 1 then do;
        put " ";
        put @10 3 * "*" " RECODING OF IMPUTATION " 3 * "*" ";";
      end;
      put @3  "if " @6 variable " = . " @40 "then " variable
          " = " value ";";
    run;

    %mend impute;

    filename impute 'E:\sas\impute.sas';

    %impute(data = test, y = y, x = x2 x3 x4, outfile = impute);

    /*
    $ more impute.sas
    **************************************************************;
    *IMPUTED VARIABLE|IMPUTED VALUE|BAD RATE|MIS. COUNT|MIS. RATE*;
    *------------------------------------------------------------*;
    * X2             | -0.68       |   4.55%| 620      |   4.68% *;
    *------------------------------------------------------------*;
    * X3             | -0.685      |   5.67%| 1901     |   5.47% *;
    *------------------------------------------------------------*;
    * X4             | 1.728       |   5.00%| 1194     |   5.11% *;
    **************************************************************;
     
             *** RECODING OF IMPUTATION ***;
      if X2  = .                           then X2  = -0.683 ;
      if X3  = .                           then X3  = -0.685 ;
      if X4  = .                           then X4  = 1.728 ;
    */

    *** SHOW HOW TO USE THE IMPUTING RECODE ***;
    data test2;
      set test;
      %inc impute / source2;
    run;

    proc means data = test2 n nmiss nonobs;
      class y;
      var x2 - x4;
    run;  

    /*                                       N
         Y    Variable    Label       N    Miss
         0    X2          X2       5808       0
              X3          X3       5808       0
              X4          X4       5808       0
         1    X2          X2        372       0
              X3          X3        372       0
              X4          X4        372       0
    */

    READ FIXED-WIDTH DATA FILE WITH read.fwf()

    ##############################################
    # READ FIXED-WIDTH DATA FILE WITH read.fwf() #
    # ------------------------------------------ #
    # EQUIVALENT SAS CODE:                       #
    # filename data 'E:\sas\fixed.txt';          #
    # data test;                                 #
    #   infile data truncover;                   #
    #   input @1 city $ 1 - 22 @23 population;   #
    # run;                                       #
    ##############################################

    # OPEN A CONNECTION TO THE DATA FILE
    data <- file(description = "e:\\sas\\fixed.txt", open = "r")

    # width = c(...)      ==> SPECIFIES COLUMN WIDTHS
    # col.names = c(...)  ==> GIVES COLUMN NAMES
    # colClasses = c(...) ==> DEFINES COLUMN CLASSES
    test <- read.fwf(data, header = FALSE, width = c(22, 10), 
                     col.names = c("city", "population"),
                     colClasses = c("character", "numeric"))

    close(data)

    READ DATA FROM A FILE CONNECTION WITH SCAN()

    #################################################
    # READ DATA FROM A FILE CONNECTION WITH SCAN()  #
    #-----------------------------------------------#
    # EQUIVALENT SAS CODE:                          #
    # filename data 'E:\sas\Kyphosis';              #
    # data test;                                    #
    #   infile data;                                #
    #   input x1 x2 x3 y @@;                        #
    # run;                                          #
    #################################################

    # OPEN A CONNECTION TO THE DATA FILE
    data <- file(description = "e:\\sas\\kyphosis", open = "r")

    # READ DATA FROM THE CONNECTION
    test <- data.frame(scan(data, what = list(x1 = 0, x2 = 0, x3 = 0, y = ""),
                            quiet = TRUE))

    # CLOSE THE CONNECTION
    close(data) 

    EVALUATION OF Logistic() FUNCTION IN RWeka

    ##############################################
    # EVALUATION OF Logistic() FUNCTION IN RWeka #
    ##############################################

    library(rpart)
    data(kyphosis)

    # MODEL FROM glm() FUNCTION
    model1 <- glm(Kyphosis ~., data = kyphosis, family = binomial())

    # PRINT OUT MODEL SUMMARY
    summary(model1)

    #               Estimate Std. Error z value Pr(>|z|)  
    #  (Intercept) -2.036934   1.449575  -1.405  0.15996  
    #  Age          0.010930   0.006446   1.696  0.08996 .
    #  Number       0.410601   0.224861   1.826  0.06785 .
    #  Start       -0.206510   0.067699  -3.050  0.00229 **

    # MODEL FROM Logistic() FUNCTION IN RWeka
    library(RWeka)
    model2 <- Logistic(Kyphosis ~., data = kyphosis)

    # PRINT OUT ESTIMATES
    print(model2)

    #                Class
    # Variable      absent
    # ====================
    # Age          -0.0109
    # Number       -0.4106
    # Start         0.2065
    # Intercept     2.0369

    # MODEL EVALUATION BASED ON 5-FOLD CROSS-VALIDATION
    summary(logit, newdata = kyphosis, class = TRUE, numFolds = 5, seed = 2009)

    # === 5 Fold Cross Validation ===
    # === Summary ===
    # Correctly Classified Instances          65               80.2469 %
    # Incorrectly Classified Instances        16               19.7531 %
    # Kappa statistic                          0.3481
    # Mean absolute error                      0.2503
    # Root mean squared error                  0.3624
    # Relative absolute error                 74.2609 %
    # Root relative squared error             88.868  %
    # Total Number of Instances               81    
    # === Detailed Accuracy By Class ===
    #                TP Rate   FP Rate   Precision   Recall  F-Measure   ROC Area  Class
    #                  0.906     0.588      0.853     0.906     0.879      0.835    absent
    #                  0.412     0.094      0.538     0.412     0.467      0.835    present
    # Weighted Avg.    0.802     0.484      0.787     0.802     0.792      0.835
    # === Confusion Matrix ===
    #   a  b   <-- classified as
    #  58  6 |  a = absent
    #  10  7 |  b = present