wensui님의 프로필WENSUI'S BLOG IN STATIST...사진블로그리스트기타 ![]() | 도움말 |
HOW TO CREATE A CROSS TABLE IN R#################################### # HOW TO CREATE A CROSS TABLE IN R # #################################### x <- cut(1:100, 2, ordered_result = TRUE) y <- cut(runif(100, 0, 1), 2, ordered_result = TRUE) ### WITH TABLE() FUNCTION ### table1 <- table(x, y) # y # x (0.000824,0.493] (0.493,0.985] # (0.901,50.5] 20 30 # (50.5,100] 22 28 t1 <- data.frame(table1) # x y Freq # 1 (0.901,50.5] (0.000824,0.493] 20 # 2 (50.5,100] (0.000824,0.493] 22 # 3 (0.901,50.5] (0.493,0.985] 30 # 4 (50.5,100] (0.493,0.985] 28 ### WITH FTABLE() FUNCTION ### table2 <- ftable(x, y) # y (0.000824,0.493] (0.493,0.985] # x # (0.901,50.5] 20 30 # (50.5,100] 22 28 t2 <- data.frame(table2) # x y Freq # 1 (0.901,50.5] (0.000824,0.493] 20 # 2 (50.5,100] (0.000824,0.493] 22 # 3 (0.901,50.5] (0.493,0.985] 30 # 4 (50.5,100] (0.493,0.985] 28 ### WITH XTABS() FUNCTION ### table3 <- xtabs(~x + y) # y # x (0.000824,0.493] (0.493,0.985] # (0.901,50.5] 20 30 # (50.5,100] 22 28 t3 <- data.frame(table3) # x y Freq # 1 (0.901,50.5] (0.000824,0.493] 20 # 2 (50.5,100] (0.000824,0.493] 22 # 3 (0.901,50.5] (0.493,0.985] 30 # 4 (50.5,100] (0.493,0.985] 28 EXTRACT ROWS FROM A DATA FRAME IN R> ################################################## > # EXTRACT ROWS FROM A DATA FRAME # > ################################################## > > data <- data.frame(scan(what = list(x = 0, y = 0))) 1: 1 1 2: 1 2 3: 1 3 4: 2 1 5: 2 2 6: 2 3 7: Read 6 records > > # EXTRACT ROWS WITH LOGICAL SUBSCRIPTS > rows1 <- data[data$x == 2 & data$y >= 2,] > rows1 x y 5 2 2 6 2 3 > > # EXTRACT ROWS WITH ROW INDEX > rows2 <- data[which(data$x == 2 & data$y >= 2),] > rows2 x y 5 2 2 6 2 3 > > # EXTRACT ROWS WITH SUBSET() FUNCTION > rows3 <- subset(data, x == 2 & y >= 2) > rows3 x y 5 2 2 6 2 3 > > # EXTRACT ROWS WITH SQLDF() FUNCTION > require(sqldf) > rows4 <- sqldf("select * from data where x = 2 and y >= 2", row.names = TRUE) > rows4 x y 5 2 2 6 2 3 A DEMO OF VECTOR AUTOREGRESSIVE FORECASTING MODEL################################################# # SELECT THE BEST LAG ESTIMATE REGRESSION WITH OUTLIERS*** SIMULATE DATA WITH OUTLIERS ***; data one; do i = 1 to 1000; x1 = ranuni(1); x2 = ranuni(2); if i > 900 then y = 50 + rannor(1); else y = 10 + 2 * x1 + 4 * x2 + rannor(1) * 0.5; output; end; run; *** ROBUST REGRESSION WITH M-ESTIMATION ***; proc robustreg data = one method = m; model y = x1 x2; run; /* Standard 95% Confidence Chi- Parameter DF Estimate Error Limits Square Pr > ChiSq Intercept 1 9.8867 0.0463 9.7959 9.9776 45506.8 <.0001 x1 1 2.1440 0.0591 2.0282 2.2597 1318.17 <.0001 x2 1 4.0620 0.0606 3.9433 4.1806 4500.31 <.0001 Scale 1 0.5850 */ *** GENERALIZED MAXIMUM ENTROPY ***; proc entropy data = one gmenm; model y = x1 x2; run; /* Approx Approx Variable Estimate Std Err t Value Pr > |t| x1 0.30864 0.00877 35.20 <.0001 x2 1.490858 0.0397 37.58 <.0001 Intercept 15.02985 0.3087 48.69 <.0001 */ *** RIDGE REGRESSION ***; proc reg data = one ridge = 1 to 4 by 0.1 outest = est; model y = x1 x2 / noprint; run; proc print data = est; run; /* _MODEL_ _TYPE_ _DEPVAR_ _RIDGE_ _PCOMIT_ _RMSE_ Intercept x1 x2 y MODEL1 PARMS y . . 11.0916 14.3101 0.60107 4.17812 -1 MODEL1 RIDGE y 1.0 . 11.1080 15.5109 0.28663 2.08686 -1 MODEL1 RIDGE y 1.1 . 11.1096 15.5677 0.27235 1.98739 -1 MODEL1 RIDGE y 1.2 . 11.1111 15.6192 0.25943 1.89697 -1 MODEL1 RIDGE y 1.3 . 11.1125 15.6663 0.24767 1.81442 -1 MODEL1 RIDGE y 1.4 . 11.1139 15.7094 0.23693 1.73876 -1 MODEL1 RIDGE y 1.5 . 11.1152 15.7491 0.22708 1.66915 -1 MODEL1 RIDGE y 1.6 . 11.1164 15.7856 0.21802 1.60491 -1 MODEL1 RIDGE y 1.7 . 11.1176 15.8195 0.20965 1.54542 -1 MODEL1 RIDGE y 1.8 . 11.1187 15.8509 0.20190 1.49019 -1 ... ... */ FIND BOXCOX TRANSFORMATION OF PREDICTORS WITH QLIM PROCEDURE****************************************************; *** FIND BOXCOX TRANSFORMATIONS OF X1 & X2 ***; DOWNLOAD EXCHANGE RATES FROM WWW.OANDA.COM & CALCULATE USD INDEX#################################################
Emacs - An All-in-One Intelligent Computing Interface Useful Key Bindings for Emacs Python Mode
C-M-x : Execute the current function or class C-c C-c : Execute the buffer (i.e., the file being displayed) C-c C-h : Get context-based help C-c C-s : Execute a Python command C-c C-t : Toggle shells C-c ! : Open the Python interactive shell C-c # : Comment the highlighted (marked) region C-c ? : Show Python mode documentation C-c | : Execute the highlighted (marked) part of the current program Useful Key Bindings to Run R with ESS/Emacs
C-c C-r : submit region
C-c C-l : send whole buffer (file) without echoing the lines C-c C-b : send whole buffer with echoing the lines (slower than above) C-c C-c : submit "paragraph" (contiguous chunk of code) C-c C-f : submit a Function in current buffer C-c C-j : submit the current line C-c C-v : get help on R(enter name in mini-buffer) C-c TAB : complete object/file name Figure 1: Emacs and SAS
Figure 2: Emacs and R
Figure 3: Emacs and Python
HOW TO DOWNLOAD STOCK PRICE ONLINE AND CREATE A TIME SERIES PLOT WITH R################################################# A PYTHON FUNCTION TO READ STOCK PRICE FROM FINANCE.YAHOO.COM############################################# # READ STOCK PRICE FROM FINANCE.YAHOO.COM # ############################################# import urllib from dateutil.relativedelta import * from datetime import * def GetPrice(ticker, start, end): stock = ticker.upper() m1 = int(start.split("/")[0]) d1 = int(start.split("/")[1]) y1 = int(start.split("/")[2]) dt1 = date(y1, m1, d1) + relativedelta(months = -1); a = dt1.month b = dt1.day c = dt1.year m2 = int(end.split("/")[0]) d2 = int(end.split("/")[1]) y2 = int(end.split("/")[2]) dt2 = date(y2, m2, d2) + relativedelta(months = -1); d = dt2.month e = dt2.day f = dt2.year url = "http://ichart.finance.yahoo.com/table.csv?s=" + stock + \ "&d=" + str(d) + "&e=" + str(e) + "&f=" + str(f) + \ "&g=d&a=" + str(a) + "&b=" + str(b) + "&c=" + str(c) + "&ignore=.csv" data = urllib.urlopen(url) print data.read() GetPrice("jpm", "08/01/2009", "08/10/2009") ''' Date,Open,High,Low,Close,Volume,Adj Close 2009-08-10,42.03,43.22,42.01,42.69,43700600,42.69 2009-08-07,41.28,43.13,41.18,42.36,65696000,42.36 2009-08-06,42.28,42.46,40.22,40.75,54175600,40.75 2009-08-05,40.34,42.20,40.26,41.78,63334100,41.78 2009-08-04,39.27,40.50,39.17,40.21,43363800,40.21 2009-08-03,39.12,39.75,38.99,39.60,42957800,39.60 ''' SELECTION OF OPTIMAL FREE PARAMETER IN ARTIFICIAL NEURAL NETWORKSAbstract As nonparametric statistical techniques, Artificial Neural Networks (ANNs) have received much attention in financial modeling recently. Despite their advantages, Artificial Neural Networks suffer from the problem of over-fitting. A Cross Validation method, V-Fold Cross Validation, is proposed in this project to select the optimal value of free parameter and to improve the ability of generalization in ANNs. The performance of this method is assessed using both a simulated data and a real-world Fixed Income data. In our study, V-fold Cross Validation works successfully in two paradigms of ANNs, which are Back propagation Neural Network (BPN) and Generalized Regression Neural Network (GRNN).
1. Introduction Using Artificial Neural Networks (ANNs) as an alternative to traditional modeling methodologies to develop forecasting models and conduct function approximation in finance is not new. Since 1990s, numerous papers have been dedicated to this area. Hutchinson, Lo, and Poggio (1994) have successfully applied ANNs to learn the Black-Scholes formula with three different ANNs paradigms and the results are promising despite the needs in proper inference and better performance measurements. Research interests in ANNs are also on the comparison with other statistical methods. Tang and Fishwick (1992) have experimented a broad range of economic time series with different complexities using Back Propagation Neural Networks (BPN) and reported that BPN outperforms the traditional Box-Jenkins method in time series forecasting and is much easier to use. Leung, Chen, Daouk (2000) have applied several different methods, including Back Propagation Neural Networks (BPN) and Generalized Regression Neural Networks (GRNN), to forecasting three exchange rates and found that GRNN outperforms the other methods according to the measurements of RMSE, MAE, and U statistics.
As data-driven nonparametric methods, ANNs share most of the costs and benefits with the others in the family of nonparametric models. Despite the ability to approximate any continuous function with any desired accuracy, ANNs also suffer from the over-fitting problem. Unfortunately, there is no simple method to correct this problem. One of the remedies for the over-fitting problem in ANNs is the proper selection of network parameters. In the study, I should discuss a method for this purpose, V-fold Cross Validation, and apply it to GRNN due to the simple specification of GRNN with only one free parameter. Furthermore, the same method should also be extended to BPN with a more complicated structure of parameters. The rest of this paper is organized as the following. In Section 2, two most popular paradigms of ANNs, BPN and GRNN, and V-fold Cross Validation will be briefly introduced. In Section 3, a simulation analysis will be conducted with the proposed method on GRNN and BPN respectively. In Section 4, both GRNN and BPN with V-fold Cross Validation method should be used to approximate the forward rate of US Treasury Strips.
SAS/IML routine to calculate AIC and BIC of a Logit Model******************************************; * HOW TO USE IML PROCEDURE TO CALCULATE *; * THE LIKELIHOOD FUNCTION OF A LOGIT *; * MODEL TOGETHER WITH AIC AND BIC. *; * -------------------------------------- *; * WITH THE SAME ROUTINE, AIC AND BIC *; * FROM A SEPARATE HOLDOUT DATASET CAN BE *; * CALCULATED. *; ******************************************; *** REMISSION DATA IS FROM EXAMPLES OF ***; *** LOGISTIC PROCEDURE IN SAS MANUAL ***; ods output ParameterEstimates = Parms; proc logistic data = remission desc; model remiss = li; run; /* STANDARD SAS OUTPUT AS BENCHMARK: Model Fit Statistics Intercept Intercept and Criterion Only Covariates AIC 36.372 30.073 SC 37.668 32.665 -2 Log L 34.372 26.073 */ proc iml; *** READ INPUT DATA INTO MATRIX ***; use remission; read all var{remiss li} into data; close remission; *** READ PARAMETERS INTO MATRIX ***; use parms; read all var{estimate} into beta; close parms; y = data[, 1]; x = J(nrow(y), 1, 1)||data[, 2]; p = exp(x * beta) / (1 + exp(x * beta)); ll = log(((p ## y) # ((1 - p) ## (1 - y)))[#, ]); aic = 2 * nrow(beta) - 2 * ll; bic = nrow(beta) * log(nrow(y)) - 2 * ll; print (ll||aic||bic) [colname = {"Log Likelihood", "AIC", "BIC"} format = 8.3]; quit; /* MANUALLY CALCULATED RESULT: Log Likelihood AIC BIC -13.036 30.073 32.665 */ A FUNCTION TO CALCULATE AMORTIZATION SCHEDULE################################################# # A FUNCTION TO CALCULATE AMORTIZATION SCHEDULE # ################################################# def amortization(loan, terms, rate): print "Balance".center(15) + \ "Payment".center(15) + \ "Principal_Paid".center(15) + \ "Interest_Paid".center(15) A = (loan * rate) / (1 - pow(1 + rate, -terms)) for i in xrange(terms): P = loan * pow(1 + rate, i) - A * (pow(1 + rate, i) - 1) / rate R = P * rate print str(round(P, 2)).center(15) + \ str(round(A, 2)).center(15) + \ str(round(A - R, 2)).center(15) + \ str(round(R, 2)).center(15) amortization(100, 5, 0.1) # OUTPUT: # Balance Payment Principal_Paid Interest_Paid # 100.0 26.38 16.38 10.0 # 83.62 26.38 18.02 8.36 # 65.60 26.38 19.82 6.56 # 45.78 26.38 21.80 4.58 # 23.98 26.38 23.98 2.40 GENERALIZATIONS OF GENERALIZED ADDITIVE MODELABSTRACT Famous for its easy implementation, Logistic Regression has been the primary model in credit scoring. Albeit attractively simple, it is criticized for failing to capture nonlinearity and non-monotonicity and therefore possibly not leads to satisfactory results. Introduced by Hastie and Tibshirani, Generalized Additive Model (GAM) provides the ability to detect the nonlinear and non-monotonic relationship between the risk behavior and predictors without the loss of interpretability. However, the semi-parametric nature of GAM makes it difficult to be used in a business. In this paper, we present an application of GAM in credit scoring as well as a class of hybrid models combining ideas of Logistic Regression and GAM in order to improve the generalizability of nonlinear modeling. Model performance is evaluated and compared among discussed models using the area under Receiver Operating Characteristic (ROC) curve. It is found that our proposed method is able to yield superior results in practice.
INTRODUCTION The credit score aims to evaluate the credit worthiness of a potential borrower or the default likelihood of an existing customer with the purpose of minimizing financial costs and losses due to the credit failure. While FICO score from the credit bureau has often been used directly by most small- and mid-sized banking, credit card, and lending companies, major players tend to develop the internal scoring system by their own modeling team. The development of credit scoring models has been extensively studied and is expected to be a continuously active area in the financial industry given the present turmoil of credit crisis. Fallen into the class of Generalized Linear Model (GLM), Logistic Regression is currently the Number One statistical technique in credit scoring models due to its simplicity. In recent researches, Generalized Additive Model (GAM), a more flexible statistical model with the semi-parametric functional form, has shown promising successes in the credit risk modeling by combining the flexibility with the interpretability. However, due to its semi-parametric nature, GAM has been found difficult to be understood in the business environment and be implemented in the production setting. In our paper, we should illustrate the superiority of GAM over Logistic Regression through the development of a credit risk model. Meanwhile, we would also demonstrate how to use other data mining techniques, namely Classification and Regression Tree (CART) and Multivariate Adaptive Regression Splines (MARS), to improve the usability of GAM and to bring this new modeling technique down to earth within the framework of GLM that is more familiar to the majority of our modelers ... ...
CONCLUSION In this paper, we have demonstrated a new class of nonlinear modeling techniques and its application in credit risk modeling. In our experience, GAM outperforms Logistic Regression in two aspects. First of all, GAM relaxes the linear assumption between the response and predictors and therefore avoids the potential problem of model misspecification, which often occurs in a linear-based model such as Linear Discriminant Analysis or Logistic Regression. Secondly, by incorporating nonlinear effects, GAM helps discover hidden patterns between the response and predictors and consequently improves the predictive performance albeit at the risk of over-fitting. Besides the original concept and implementation of GAM, we have also proposed two hybrid models based upon our learning experience, which are piecewise constant and piece linear approximation models. Our findings show that the hybrid model combining the flexibility of GAM and the resistance to over-fitting of CART is able to yield the best result.
A CLASS OF PREDICTIVE MODELS FOR MULTI-LEVEL RISKABSTRACT In the financial service industry, discriminant analysis and its variants based upon binary outcome, such as logistic regression or neural networks, are largely used to develop predictive models. However, the two-state assumption of such models over-simplifies customers’ behavioral outcomes and ignores the existence of multi-level risk. In many situations, the financial impact of a certain customer is directly related to the frequency and the severity of his/her adverse behaviors. Therefore, it is of interest to model and predict such multi-level risks. Several modeling techniques, from Poisson to Ordered Logit models, have been widely discussed in numerous research literatures about how to predict the multi-level risks. Our paper is also an attempt contributed to this end. Several modeling strategies together with their implementations and related scoring scheme will be illustrated. Our purpose is to demonstrate an application of these complex statistical models with the business touch and how to implement them in a production environment.
METHODOLOGY In retail banking and credit card industries, it is a key interest to predict the probability of customer’s adverse behaviors, such as delinquencies or defaults. A widely accepted practice in the industry is to classify customers into 2 groups, the good and the bad, based upon the presence of certain adverse behaviors and then to model this binary outcome with discriminant models. For instance, a customer will be classified as the bad if he/she misses payments during a valuation horizon of one year. In SAS community, most efforts contributed to the improvement of these prediction models so far have been focusing on discovering the relationship between the outcome and predictors through either parametric or nonparametric statistical methods. Li (2006) compared discriminant analysis and logistic regression in the credit risk modeling. Liu and Cela (2007) demonstrated how to use Generalized Additive Models to capture the nonlinear relationship in a credit scoring model. However, an obvious limitation of discriminant models based upon the binary outcome is that the two-state classification over-simplifies adverse behaviors of customers. To the best of our knowledge, what financially impact a financial institute are not only the presence of a certain adverse behavior but also the frequency and the severity of such behavior. As a result, it is advantageous to differentiate different levels of the risk and evaluate the probability of each risk level.
... ...
CONCLUSION In this paper, we have reviewed several modeling strategies for count data and their implementations. Basic Poisson models with and without the consideration of observed heterogeneity is a good starting point for count data modeling. For count data with the evidence of over-dispersion, negative binomial regression with a more liberal assumption on variance is able to provide a better solution. If the over-dispersion results from a high frequency of zero counts, advanced composite models such as hurdle regression, zip regression and latent class regression might give more satisfactory fit to the data. An example in credit risk assessment has been used in our paper to demonstrate the usage of various models for count data and related statistical tests. However, successful applications can also be extended to other business problems, such as database marketing, healthcare utilization, and quality control.
SAS Macro to do Monotonic WOE Transformation for Numeric PredictorsTo Modelers and ScoreCard Developers who understand WOE: 1. What does the macro do? RealMatrix++: A Real-Valued Matrix Class and LibraryRealMatrix++ A Real-Valued Matrix Class and Library Daniel Hanson Email: HoserAnalytics (at) gmail (dot) com June 1, 2009 Introduction and Purpose A frequent frustration for quant development in finance is the lack of a matrix class and matrix algebra library in the C++ Standard Library. Third party libraries do exist, both open source and commercial, but open source libraries are often time consuming to learn and implement, and commercial libraries can carry a heavy price tag. Furthermore, for quant finance, as in other domains such as predictive modeling, the vast majority of linear algebra calculations are real-valued. The intent of the RealMatrix++ Library is to provide an easy-to-use, non-templated matrix class and library for real-valued matrices and vectors, with the ultimate goal of providing functionality on par with that of the JAMA library for Java [1]. The code is not optimized for performance to the extent of, say, the Matrix Template Library [2], or Blitz [3], but it will likely offer more rapid development, and performance sufficient at least for prototyping models. Current Release Features and Design This initial release contains a real matrix class, and matrix operations +, -, and *, where (*) is overloaded for standard matrix multiplication, vector by matrix and matrix by vector multiplication, and scalar multiplication (see MatrixOperations.h for details). There is also a set of validation conditions outlined in MatrixConditions.h, which will check if two matrices are of the same dimensions (necessary for matrix addition and subtraction), if a matrix is square, or if a matrix is symmetric. The Matrix class itself is built as a wrapper around the valarray container in the C++ standard library. As Stroustrup[4] says, "The standard library does not provide a matrix class. Instead, the intent is for valarray to provide the tools for building matrices optimized for a variety of needs." Also, as Josuttis [5] points out, "a valarray can be used as a base both for vector and matrix operations.with good performance." On the other hand, he notes that "you often need some inconvenient and time-consuming type conversions." This relates primarily to the need to create a separate valarray instance to use the valarray slice function. For this reason, its use is avoided in the Matrix class here, although it may be used minimally going forward, in cases where the benefit outweighs the inconvenience. The current release has been unit tested, but not completely stress tested. It should work as designed, but at this stage it should probably be best considered a beta release. Your comments are invited (email address given at the top of this document). Future Additions/Enhancements The following functionality is being considered for future releases (not necessarily an exhaustive list): Matrix Decompositions o Cholesky Decomposition of square and symmetric matrices o Singular Value Decomposition o LU Decomposition Eigenvalue calculations Principal component analysis Solutions of linear systems Matrix inverse Licensing This software is released under the terms of the BSD License [6]: Copyright (c) 2009, Daniel Hanson All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of the copyright holder nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. References [1] http://math.nist.gov/javanumerics/jama/ [2] http://www.osl.iu.edu/research/mtl/ [3] http://www.oonumerics.org/blitz/ [4] Bjarne Stroustrup, The C++ Programming Language, 3E (Addison Wesley) [5] Nicolai Josuttis, The C++ Standard Library: A Tutorial and Reference (Addison Wesley) [6] http://www.opensource.org/licenses/bsd-license.php A MACRO TO IMPUTE NUMERIC VARIABLES BASED UPON BAD RATESlibname data 'E:\sas'; data test; set data.credit; if ranuni(2) < 0.1 then x2 = .; if ranuni(3) < 0.3 then x3 = .; if ranuni(4) < 0.2 then x4 = .; run; proc means data = test n nmiss nonobs; class y; var x2 - x4; run; /* N Y Variable Label N Miss 0 X2 X2 5217 591 X3 X3 4011 1797 X4 X4 4675 1133 1 X2 X2 343 29 X3 X3 268 104 X4 X4 311 61 */ %macro impute(data = , y = , x = , outfile = ); *********************************************; * THIS MACRO IS TO IMPUTE NUMERIC VARIABLES *; * BASED UPON BAD RATES *; * ----------------------------------------- *; * PARAMETERS: *; * DATA : INPUT DATA SET *; * Y : RESPONSE WITH 0 / 1 *; * X : A LIST OF NUMERIC PREDICTORS *; * WITH MISSING VALUES *; * OUTFILE: FILEREF FOR IMPUTING RECODE *; *********************************************; %if %sysfunc(fexist(&outfile)) %then %do; data _null_; rc = fdelete("&outfile"); if rc = 0 then put "THE FILE HAS BEEN DELETE."; run; %end; ods output position = _pos1; ods listing close; proc contents data = &data varnum; run; ods listing; proc sql noprint; select upcase(variable) into :x2 separated by ' ' from _pos1 where compress(upcase(type), ' ') = 'NUM' and index("%upcase(%sysfunc(compbl(&x)))", compress(upcase(variable), ' ')) > 0; select count(variable) into :xcnt from _pos1 where compress(upcase(type), ' ') = 'NUM' and index("%upcase(%sysfunc(compbl(&x)))", compress(upcase(variable), ' ')) > 0; quit; data _tmp1; retain &x2 &y; set &data; where &y in (0, 1); keep &x2 &y; run; ods output position = _pos2; ods listing close; proc contents data = _tmp1 varnum; run; ods listing; %let loop = 0; %do i = 1 %to &xcnt; proc sql noprint; select upcase(variable) into :var from _pos2 where num = &i; select count(distinct &var) into :vcnt from _tmp1 where &var ~= . and &y in (1, 0); select nmiss(&var) / count(*) * 100 into :mpct from _tmp1 where &y in (1, 0); select count(distinct &y) into :yflg from _tmp1 where &y in (1, 0); quit; %if &vcnt > 1 & &mpct > 0 & &mpct < 99 & &yflg = 2 %then %do; %let loop = %eval(&loop + 1); proc sql noprint; select mean(&y) format = 10.6 into :mis_rate from _tmp1 where &var = . and &y in (0, 1); select count(*) into :mcnt from _tmp1 where &var = . and &y in (1, 0); create table _tmp2 as select distinct "%upcase(%trim(&var))" as variable, &var as value, mean(&y) as bad_rate, &mis_rate as mis_rate, count(*) as freq, &mcnt as mis_cnt, abs(mean(&y) - &mis_rate) as dif from _tmp1 where &var ~= . and &y in (0, 1) group by &var; quit; proc sort data = _tmp2 sortsize = max; by dif descending freq; run; data _tmp3; set _tmp2; by dif descending freq; if _n_ = 1 then output; run; %if &loop = 1 %then %do; data _result; set _tmp3; run; %end; %else %then %do; data _result; set _result _tmp3; run; %end; %end; %end; data _null_; set _result end = eof; file impute mod; if _n_ = 1 then do; put @1 93 * "*" ";"; put @1 "* IMPUTED VARIABLE" @32 "| IMPUTED VALUE" @52 "| BAD RATE" @65 "| MIS. COUNT" @80 "| MIS. RATE" @93 "*;"; end; put @1 "*" 91 * "-" @93 "*;"; put @1 "*" @3 variable @32 "|" @34 value @52 "|" @54 bad_rate percent8.2 @65 "|" @67 mis_cnt @80 "|" @82 mis_rate percent8.2 @93 "*;"; if eof then do; put @1 93 * "*" ";"; end; run; data _null_; set _result; file impute mod; if _n_ = 1 then do; put " "; put @10 3 * "*" " RECODING OF IMPUTATION " 3 * "*" ";"; end; put @3 "if " @6 variable " = . " @40 "then " variable " = " value ";"; run; %mend impute; filename impute 'E:\sas\impute.sas'; %impute(data = test, y = y, x = x2 x3 x4, outfile = impute); /* $ more impute.sas **************************************************************; *IMPUTED VARIABLE|IMPUTED VALUE|BAD RATE|MIS. COUNT|MIS. RATE*; *------------------------------------------------------------*; * X2 | -0.68 | 4.55%| 620 | 4.68% *; *------------------------------------------------------------*; * X3 | -0.685 | 5.67%| 1901 | 5.47% *; *------------------------------------------------------------*; * X4 | 1.728 | 5.00%| 1194 | 5.11% *; **************************************************************; *** RECODING OF IMPUTATION ***; if X2 = . then X2 = -0.683 ; if X3 = . then X3 = -0.685 ; if X4 = . then X4 = 1.728 ; */ *** SHOW HOW TO USE THE IMPUTING RECODE ***; data test2; set test; %inc impute / source2; run; proc means data = test2 n nmiss nonobs; class y; var x2 - x4; run; /* N Y Variable Label N Miss 0 X2 X2 5808 0 X3 X3 5808 0 X4 X4 5808 0 1 X2 X2 372 0 X3 X3 372 0 X4 X4 372 0 */ READ FIXED-WIDTH DATA FILE WITH read.fwf()############################################## # READ FIXED-WIDTH DATA FILE WITH read.fwf() # # ------------------------------------------ # # EQUIVALENT SAS CODE: # # filename data 'E:\sas\fixed.txt'; # # data test; # # infile data truncover; # # input @1 city $ 1 - 22 @23 population; # # run; # ############################################## # OPEN A CONNECTION TO THE DATA FILE data <- file(description = "e:\\sas\\fixed.txt", open = "r") # width = c(...) ==> SPECIFIES COLUMN WIDTHS # col.names = c(...) ==> GIVES COLUMN NAMES # colClasses = c(...) ==> DEFINES COLUMN CLASSES test <- read.fwf(data, header = FALSE, width = c(22, 10), col.names = c("city", "population"), colClasses = c("character", "numeric")) close(data) READ DATA FROM A FILE CONNECTION WITH SCAN()################################################# # READ DATA FROM A FILE CONNECTION WITH SCAN() # #-----------------------------------------------# # EQUIVALENT SAS CODE: # # filename data 'E:\sas\Kyphosis'; # # data test; # # infile data; # # input x1 x2 x3 y @@; # # run; # ################################################# # OPEN A CONNECTION TO THE DATA FILE data <- file(description = "e:\\sas\\kyphosis", open = "r") # READ DATA FROM THE CONNECTION test <- data.frame(scan(data, what = list(x1 = 0, x2 = 0, x3 = 0, y = ""), quiet = TRUE)) # CLOSE THE CONNECTION close(data) EVALUATION OF Logistic() FUNCTION IN RWeka############################################## # EVALUATION OF Logistic() FUNCTION IN RWeka # ############################################## library(rpart) data(kyphosis) # MODEL FROM glm() FUNCTION model1 <- glm(Kyphosis ~., data = kyphosis, family = binomial()) # PRINT OUT MODEL SUMMARY summary(model1) # Estimate Std. Error z value Pr(>|z|) # (Intercept) -2.036934 1.449575 -1.405 0.15996 # Age 0.010930 0.006446 1.696 0.08996 . # Number 0.410601 0.224861 1.826 0.06785 . # Start -0.206510 0.067699 -3.050 0.00229 ** # MODEL FROM Logistic() FUNCTION IN RWeka library(RWeka) model2 <- Logistic(Kyphosis ~., data = kyphosis) # PRINT OUT ESTIMATES print(model2) # Class # Variable absent # ==================== # Age -0.0109 # Number -0.4106 # Start 0.2065 # Intercept 2.0369 # MODEL EVALUATION BASED ON 5-FOLD CROSS-VALIDATION summary(logit, newdata = kyphosis, class = TRUE, numFolds = 5, seed = 2009) # === 5 Fold Cross Validation === # === Summary === # Correctly Classified Instances 65 80.2469 % # Incorrectly Classified Instances 16 19.7531 % # Kappa statistic 0.3481 # Mean absolute error 0.2503 # Root mean squared error 0.3624 # Relative absolute error 74.2609 % # Root relative squared error 88.868 % # Total Number of Instances 81 # === Detailed Accuracy By Class === # TP Rate FP Rate Precision Recall F-Measure ROC Area Class # 0.906 0.588 0.853 0.906 0.879 0.835 absent # 0.412 0.094 0.538 0.412 0.467 0.835 present # Weighted Avg. 0.802 0.484 0.787 0.802 0.792 0.835 # === Confusion Matrix === # a b <-- classified as # 58 6 | a = absent # 10 7 | b = present |
|
|