diff --git a/CHANGELOG b/CHANGELOG index db4bf8fb0..758da2167 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,7 @@ #Nmap Changelog ($Id$); -*-text-*- o Upgrade included libraries: Lua 5.4.6, libpcre2 10.43, zlib 1.3.1, - libssh2 1.11.0 + libssh2 1.11.0, liblinear 2.47 o [Zenmap][GH#2739] Fix a crash in Zenmap when changing a host comment. diff --git a/checklibs.sh b/checklibs.sh index 1b7708bf0..c24835418 100644 --- a/checklibs.sh +++ b/checklibs.sh @@ -69,9 +69,16 @@ EOC check_liblinear() { LINEAR_SOURCE="https://www.csie.ntu.edu.tw/~cjlin/liblinear/" - echo "Can't check liblinear, no version information is available" - LINEAR_LATEST=$(curl -Ls $LINEAR_SOURCE | perl -lne 'if(/The current release \(([^)]+)\) of LIBLINEAR/){print $1;exit 0}') - echo " Latest:" $LINEAR_LATEST + LINEAR_VERSION=$(awk '$2=="LIBLINEAR_VERSION"{print$3;exit}' $NDIR/liblinear/linear.h | sed 's/./&./1') + LINEAR_LATEST=$(curl -Ls $LINEAR_SOURCE | perl -lne 'if(/liblinear-([\d.]+).tar.gz/){print $1}' | newest) + if [ "$LINEAR_VERSION" != "$LINEAR_LATEST" ]; then + echo "Newer version of liblinear available" + echo " Current:" $LINEAR_VERSION + echo " Latest: " $LINEAR_LATEST + echo " Source: $LINEAR_SOURCE" + else + echo "liblinear: $LINEAR_VERSION" + fi } check_zlib() { diff --git a/liblinear/COPYRIGHT b/liblinear/COPYRIGHT index 71c0ad7f9..4e09b34da 100644 --- a/liblinear/COPYRIGHT +++ b/liblinear/COPYRIGHT @@ -1,5 +1,5 @@ -Copyright (c) 2007-2011 The LIBLINEAR Project. +Copyright (c) 2007-2023 The LIBLINEAR Project. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/liblinear/Makefile b/liblinear/Makefile index 40af81a67..065571e85 100644 --- a/liblinear/Makefile +++ b/liblinear/Makefile @@ -2,35 +2,42 @@ CXX ?= g++ CC ?= gcc CFLAGS = -Wall -Wconversion -O3 -fPIC LIBS = blas/blas.a -SHVER = 1 +#LIBS = -lblas +SHVER = 5 AR = ar RANLIB = ranlib -#LIBS = -lblas +OS = $(shell uname) +ifeq ($(OS),Darwin) + SHARED_LIB_FLAG = -dynamiclib -Wl,-install_name,liblinear.so.$(SHVER) +else + SHARED_LIB_FLAG = -shared -Wl,-soname,liblinear.so.$(SHVER) +endif all: train predict -lib: linear.o tron.o blas/blas.a - $(CXX) -shared -dynamiclib linear.o tron.o blas/blas.a -o liblinear.so.$(SHVER) +lib: linear.o newton.o blas/blas.a + $(CXX) $(SHARED_LIB_FLAG) linear.o newton.o blas/blas.a -o liblinear.so.$(SHVER) -liblinear.a: linear.o tron.o blas/blas.a - $(AR) rcv liblinear.a linear.o tron.o blas/*.o +liblinear.a: linear.o newton.o blas/blas.a + $(AR) rcv liblinear.a linear.o newton.o blas/*.o $(RANLIB) liblinear.a -train: tron.o linear.o train.c blas/blas.a - $(CXX) $(CFLAGS) -o train train.c tron.o linear.o $(LIBS) +train: newton.o linear.o train.c blas/blas.a + $(CXX) $(CFLAGS) -o train train.c newton.o linear.o $(LIBS) -predict: tron.o linear.o predict.c blas/blas.a - $(CXX) $(CFLAGS) -o predict predict.c tron.o linear.o $(LIBS) +predict: newton.o linear.o predict.c blas/blas.a + $(CXX) $(CFLAGS) -o predict predict.c newton.o linear.o $(LIBS) -tron.o: tron.cpp tron.h - $(CXX) $(CFLAGS) -c -o tron.o tron.cpp +newton.o: newton.cpp newton.h + $(CXX) $(CFLAGS) -c -o newton.o newton.cpp linear.o: linear.cpp linear.h $(CXX) $(CFLAGS) -c -o linear.o linear.cpp -blas/blas.a: - cd blas; $(MAKE) OPTFLAGS='$(CFLAGS)' CC='$(CC)'; +blas/blas.a: blas/*.c blas/*.h + make -C blas OPTFLAGS='$(CFLAGS)' CC='$(CC)'; clean: - cd blas; $(MAKE) clean - rm -f *~ tron.o linear.o train predict liblinear.so.$(SHVER) liblinear.a + make -C blas clean + make -C matlab clean + rm -f *~ newton.o linear.o 
train predict liblinear.so.$(SHVER) diff --git a/liblinear/Makefile.win b/liblinear/Makefile.win index c5c721fae..1b199701c 100644 --- a/liblinear/Makefile.win +++ b/liblinear/Makefile.win @@ -1,30 +1,24 @@ -#You must ensure nmake.exe, cl.exe, link.exe are in system path. -#VCVARS32.bat -#Under dosbox prompt -#nmake -f Makefile.win - -########################################## -CXXC = cl.exe -CFLAGS = -nologo -O2 -EHsc -I. -D __WIN32__ -D _CRT_SECURE_NO_DEPRECATE +CXX = cl.exe +CFLAGS = /nologo /O2 /EHsc /I. /D _WIN64 /D _CRT_SECURE_NO_DEPRECATE TARGET = windows -all: $(TARGET)\train.exe $(TARGET)\predict.exe +all: $(TARGET)\train.exe $(TARGET)\predict.exe lib -$(TARGET)\train.exe: tron.obj linear.obj train.c blas\*.c - $(CXX) $(CFLAGS) -Fe$(TARGET)\train.exe tron.obj linear.obj train.c blas\*.c +$(TARGET)\train.exe: newton.obj linear.obj train.c blas\*.c + $(CXX) $(CFLAGS) -Fe$(TARGET)\train.exe newton.obj linear.obj train.c blas\*.c -$(TARGET)\predict.exe: tron.obj linear.obj predict.c blas\*.c - $(CXX) $(CFLAGS) -Fe$(TARGET)\predict.exe tron.obj linear.obj predict.c blas\*.c +$(TARGET)\predict.exe: newton.obj linear.obj predict.c blas\*.c + $(CXX) $(CFLAGS) -Fe$(TARGET)\predict.exe newton.obj linear.obj predict.c blas\*.c linear.obj: linear.cpp linear.h $(CXX) $(CFLAGS) -c linear.cpp -tron.obj: tron.cpp tron.h - $(CXX) $(CFLAGS) -c tron.cpp +newton.obj: newton.cpp newton.h + $(CXX) $(CFLAGS) -c newton.cpp -lib: linear.cpp linear.h linear.def tron.obj - $(CXX) $(CFLAGS) -LD linear.cpp tron.obj blas\*.c -Fe$(TARGET)\liblinear -link -DEF:linear.def +lib: linear.cpp linear.h linear.def newton.obj + $(CXX) $(CFLAGS) -LD linear.cpp newton.obj blas\*.c -Fe$(TARGET)\liblinear -link -DEF:linear.def clean: - -erase /Q *.obj $(TARGET)\. + -erase /Q *.obj $(TARGET)\*.exe $(TARGET)\*.dll $(TARGET)\*.exp $(TARGET)\*.lib diff --git a/liblinear/README b/liblinear/README index 4e45c2e35..0c4338542 100644 --- a/liblinear/README +++ b/liblinear/README @@ -1,8 +1,10 @@ -LIBLINEAR is a simple package for solving large-scale regularized -linear classification. It currently supports L2-regularized logistic -regression/L2-loss support vector classification/L1-loss support vector -classification, and L1-regularized L2-loss support vector classification/ -logistic regression. This document explains the usage of LIBLINEAR. +LIBLINEAR is a simple package for solving large-scale regularized linear +classification, regression and outlier detection. It currently supports +- L2-regularized logistic regression/L2-loss support vector classification/L1-loss support vector classification +- L1-regularized L2-loss support vector classification/L1-regularized logistic regression +- L2-regularized L2-loss support vector regression/L1-loss support vector regression +- one-class support vector machine. +This document explains the usage of LIBLINEAR. To get started, please read the ``Quick Start'' section first. For developers, please check the ``Library Usage'' section to learn @@ -16,20 +18,21 @@ Table of Contents - Installation - `train' Usage - `predict' Usage +- `svm-scale' Usage - Examples - Library Usage - Building Windows Binaries -- Additional Information - MATLAB/OCTAVE interface -- PYTHON interface +- Python Interface +- Additional Information When to use LIBLINEAR but not LIBSVM ==================================== There are some large data for which with/without nonlinear mappings gives similar performances. Without using kernels, one can -efficiently train a much larger set via a linear classifier. 
These -data usually have a large number of features. Document classification +efficiently train a much larger set via linear classification/regression. +These data usually have a large number of features. Document classification is an example. Warning: While generally liblinear is very fast, its default solver @@ -74,8 +77,8 @@ sparse data, use `-l 0' to keep the sparsity. Installation ============ -On Unix systems, type `make' to build the `train' and `predict' -programs. Run them without arguments to show the usages. +On Unix systems, type `make' to build the `train', `predict', +and `svm-scale' programs. Run them without arguments to show the usages. On other systems, consult `Makefile' to build them (e.g., see 'Building Windows binaries' in this file) or use the pre-built @@ -85,11 +88,13 @@ This software uses some level-1 BLAS subroutines. The needed functions are included in this package. If a BLAS library is available on your machine, you may use it by modifying the Makefile: Unmark the following line - #LIBS ?= -lblas + #LIBS = -lblas and mark - LIBS ?= blas/blas.a + LIBS = blas/blas.a + +The tool `svm-scale', borrowed from LIBSVM, is for scaling input data file. `train' Usage ============= @@ -97,33 +102,55 @@ and mark Usage: train [options] training_set_file [model_file] options: -s type : set type of solver (default 1) - 0 -- L2-regularized logistic regression (primal) - 1 -- L2-regularized L2-loss support vector classification (dual) - 2 -- L2-regularized L2-loss support vector classification (primal) - 3 -- L2-regularized L1-loss support vector classification (dual) - 4 -- multi-class support vector classification by Crammer and Singer - 5 -- L1-regularized L2-loss support vector classification - 6 -- L1-regularized logistic regression - 7 -- L2-regularized logistic regression (dual) + for multi-class classification + 0 -- L2-regularized logistic regression (primal) + 1 -- L2-regularized L2-loss support vector classification (dual) + 2 -- L2-regularized L2-loss support vector classification (primal) + 3 -- L2-regularized L1-loss support vector classification (dual) + 4 -- support vector classification by Crammer and Singer + 5 -- L1-regularized L2-loss support vector classification + 6 -- L1-regularized logistic regression + 7 -- L2-regularized logistic regression (dual) + for regression + 11 -- L2-regularized L2-loss support vector regression (primal) + 12 -- L2-regularized L2-loss support vector regression (dual) + 13 -- L2-regularized L1-loss support vector regression (dual) + for outlier detection + 21 -- one-class support vector machine (dual) -c cost : set the parameter C (default 1) +-p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1) +-n nu : set the parameter nu of one-class SVM (default 0.5) -e epsilon : set tolerance of termination criterion -s 0 and 2 |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2, where f is the primal function and pos/neg are # of positive/negative data (default 0.01) - -s 1, 3, 4 and 7 - Dual maximal violation <= eps; similar to libsvm (default 0.1) + -s 11 + |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.0001) + -s 1, 3, 4, 7, and 21 + Dual maximal violation <= eps; similar to libsvm (default 0.1 except 0.01 for -s 21) -s 5 and 6 - |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf, + |f'(w)|_1 <= eps*min(pos,neg)/l*|f'(w0)|_1, where f is the primal function (default 0.01) + -s 12 and 13 + |f'(alpha)|_1 <= eps |f'(alpha0)|, + where f is the dual function (default 0.1) -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias 
term added (default -1) +-R : not regularize the bias; must with -B 1 to have the bias; DON'T use this unless you know what it is + (for -s 0, 2, 5, 6, 11) -wi weight: weights adjust the parameter C of different classes (see README for details) -v n: n-fold cross validation mode +-C : find parameters (C for -s 0, 2 and C, p for -s 11) -q : quiet mode (no outputs) Option -v randomly splits the data into n parts and calculates cross validation accuracy on them. +Option -C conducts cross validation under different parameters and finds +the best one. This option is supported only by -s 0, -s 2 (for finding +C) and -s 11 (for finding C, p). If the solver is not specified, -s 2 +is used. + Formulations: For L2-regularized logistic regression (-s 0), we solve @@ -152,23 +179,57 @@ For L1-regularized logistic regression (-s 6), we solve min_w \sum |w_j| + C \sum log(1 + exp(-y_i w^Tx_i)) -where - -Q is a matrix with Q_ij = y_i y_j x_i^T x_j. - For L2-regularized logistic regression (-s 7), we solve min_alpha 0.5(alpha^T Q alpha) + \sum alpha_i*log(alpha_i) + \sum (C-alpha_i)*log(C-alpha_i) - a constant s.t. 0 <= alpha_i <= C, -If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias]. +where + +Q is a matrix with Q_ij = y_i y_j x_i^T x_j. + +For L2-regularized L2-loss SVR (-s 11), we solve + +min_w w^Tw/2 + C \sum max(0, |y_i-w^Tx_i|-epsilon)^2 + +For L2-regularized L2-loss SVR dual (-s 12), we solve + +min_beta 0.5(beta^T (Q + lambda I/2/C) beta) - y^T beta + \sum |beta_i| + +For L2-regularized L1-loss SVR dual (-s 13), we solve + +min_beta 0.5(beta^T Q beta) - y^T beta + \sum |beta_i| + s.t. -C <= beta_i <= C, + +where + +Q is a matrix with Q_ij = x_i^T x_j. + +For one-class SVM dual (-s 21), we solve + +min_alpha 0.5(alpha^T Q alpha) + s.t. 0 <= alpha_i <= 1 and \sum alpha_i = nu*l, + +where + +Q is a matrix with Q_ij = x_i^T x_j. + +If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias]. For +example, L2-regularized logistic regression (-s 0) becomes + +min_w w^Tw/2 + (w_{n+1})^2/2 + C \sum log(1 + exp(-y_i [w; w_{n+1}]^T[x_i; bias])) + +Some may prefer not having (w_{n+1})^2/2 (i.e., bias variable not +regularized). For primal solvers (-s 0, 2, 5, 6, 11), we provide an +option -R to remove (w_{n+1})^2/2. However, -R is generally not needed +as for most data with/without (w_{n+1})^2/2 give similar performances. The primal-dual relationship implies that -s 1 and -s 2 give the same -model, and -s 0 and -s 7 give the same. +model, -s 0 and -s 7 give the same, and -s 11 and -s 12 give the same. -We implement 1-vs-the rest multi-class strategy. In training i -vs. non_i, their C parameters are (weight from -wi)*C and C, -respectively. If there are only two classes, we train only one +We implement 1-vs-the rest multi-class strategy for classification. +In training i vs. non_i, their C parameters are (weight from -wi)*C +and C, respectively. If there are only two classes, we train only one model. Thus weight1*C vs. weight2*C is used. See examples below. We also implement multi-class SVM by Crammer and Singer (-s 4): @@ -193,7 +254,16 @@ and C^m_i = C if m = y_i, Usage: predict [options] test_file model_file output_file options: --b probability_estimates: whether to predict probability estimates, 0 or 1 (default 0) +-b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only +-q : quiet mode (no outputs) + +Note that -b is only needed in the prediction phase. This is different +from the setting of LIBSVM. 
+ +`svm-scale' Usage +================= + +See LIBSVM README. Examples ======== @@ -206,12 +276,38 @@ Train linear SVM with L2-loss function. Train a logistic regression model. +> train -s 21 -n 0.1 data_file + +Train a linear one-class SVM which selects roughly 10% data as outliers. + > train -v 5 -e 0.001 data_file -Do five-fold cross-validation using L2-loss svm. +Do five-fold cross-validation using L2-loss SVM. Use a smaller stopping tolerance 0.001 than the default 0.1 if you want more accurate solutions. +> train -C data_file +... +Best C = 0.000488281 CV accuracy = 83.3333% +> train -c 0.000488281 data_file + +Conduct cross validation many times by L2-loss SVM and find the +parameter C which achieves the best cross validation accuracy. Then +use the selected C to train the data for getting a model. + +> train -C -s 0 -v 3 -c 0.5 -e 0.0001 data_file + +For parameter selection by -C, users can specify other +solvers (currently -s 0, -s 2 and -s 11 are supported) and +different number of CV folds. Further, users can use +the -c option to specify the smallest C value of the +search range. This option is useful when users want to +rerun the parameter selection procedure from a specified +C under a different setting, such as a stricter stopping +tolerance -e 0.0001 in the above example. Similarly, for +-s 11, users can use the -p option to specify the +maximal p value of the search range. + > train -c 10 -w1 2 -w2 5 -w3 2 four_class_data_file Train four classifiers: @@ -233,18 +329,24 @@ Output probability estimates (for logistic regression only). Library Usage ============= +These functions and structures are declared in the header file `linear.h'. +You can see `train.c' and `predict.c' for examples showing how to use them. +We define LIBLINEAR_VERSION and declare `extern int liblinear_version; ' +in linear.h, so you can check the version number. + - Function: model* train(const struct problem *prob, const struct parameter *param); - This function constructs and returns a linear classification model - according to the given training data and parameters. + This function constructs and returns a linear classification + or regression model according to the given training data and + parameters. struct problem describes the problem: struct problem { int l, n; - int *y; + double *y; struct feature_node **x; double bias; }; @@ -252,10 +354,10 @@ Library Usage where `l' is the number of training data. If bias >= 0, we assume that one additional feature is added to the end of each data instance. `n' is the number of feature (including the bias feature - if bias >= 0). `y' is an array containing the target values. And - `x' is an array of pointers, - each of which points to a sparse representation (array of feature_node) of one - training vector. + if bias >= 0). `y' is an array containing the target values. (integers + in classification, real numbers in regression) And `x' is an array + of pointers, each of which points to a sparse representation (array + of feature_node) of one training vector. For example, if we have the following training data: @@ -280,32 +382,44 @@ Library Usage [ ] -> (2,0.1) (4,1.4) (5,0.5) (6,1) (-1,?) [ ] -> (1,-0.1) (2,-0.2) (3,0.1) (4,1.1) (5,0.1) (6,1) (-1,?) 
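    A minimal C sketch of how these structures might be filled for the
    five-instance example above, assuming the declarations listed in this
    README (linear.h); the parameter values mirror the documented defaults
    and error handling is kept to a minimum:

        #include <stdio.h>
        #include "linear.h"

        int main(void)
        {
            /* sparse rows of the example above, bias = 1 appended as
               feature 6; each row ends with a node whose index = -1 */
            struct feature_node x1[] = {{2,0.1},{3,0.2},{6,1},{-1,0}};
            struct feature_node x2[] = {{2,0.1},{3,0.3},{4,-1.2},{6,1},{-1,0}};
            struct feature_node x3[] = {{1,0.4},{6,1},{-1,0}};
            struct feature_node x4[] = {{2,0.1},{4,1.4},{5,0.5},{6,1},{-1,0}};
            struct feature_node x5[] = {{1,-0.1},{2,-0.2},{3,0.1},{4,1.1},{5,0.1},{6,1},{-1,0}};
            struct feature_node *x[] = {x1, x2, x3, x4, x5};
            double y[] = {1, 2, 1, 2, 3};

            struct problem prob = {5, 6, y, x, 1};      /* l, n, y, x, bias */

            struct parameter param;
            param.solver_type = L2R_L2LOSS_SVC_DUAL;    /* -s 1, the default solver */
            param.C = 1;
            param.eps = 0.1;                            /* documented default for -s 1 */
            param.p = 0.1;
            param.nu = 0.5;
            param.nr_weight = 0;                        /* keep default class penalties */
            param.weight_label = NULL;
            param.weight = NULL;
            param.init_sol = NULL;
            param.regularize_bias = 1;  /* field used by recent versions (-R option); 1 = default */

            const char *err = check_parameter(&prob, &param);  /* always call before train() */
            if (err) { fprintf(stderr, "%s\n", err); return 1; }

            struct model *m = train(&prob, &param);
            printf("prediction for the first instance: %g\n", predict(m, x1));
            free_and_destroy_model(&m);
            return 0;
        }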
- struct parameter describes the parameters of a linear classification model: + struct parameter describes the parameters of a linear classification + or regression model: struct parameter { int solver_type; /* these are for training only */ - double eps; /* stopping criteria */ + double eps; /* stopping tolerance */ double C; + double nu; /* one-class SVM only */ int nr_weight; int *weight_label; double* weight; + double p; + double *init_sol; }; - solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL. - + solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL, ONECLASS_SVM. + for classification L2R_LR L2-regularized logistic regression (primal) L2R_L2LOSS_SVC_DUAL L2-regularized L2-loss support vector classification (dual) L2R_L2LOSS_SVC L2-regularized L2-loss support vector classification (primal) L2R_L1LOSS_SVC_DUAL L2-regularized L1-loss support vector classification (dual) - MCSVM_CS multi-class support vector classification by Crammer and Singer + MCSVM_CS support vector classification by Crammer and Singer L1R_L2LOSS_SVC L1-regularized L2-loss support vector classification L1R_LR L1-regularized logistic regression L2R_LR_DUAL L2-regularized logistic regression (dual) + for regression + L2R_L2LOSS_SVR L2-regularized L2-loss support vector regression (primal) + L2R_L2LOSS_SVR_DUAL L2-regularized L2-loss support vector regression (dual) + L2R_L1LOSS_SVR_DUAL L2-regularized L1-loss support vector regression (dual) + for outlier detection + ONECLASS_SVM one-class support vector machine (dual) C is the cost of constraints violation. + p is the sensitiveness of loss of support vector regression. + nu in ONECLASS_SVM approximates the fraction of data as outliers. eps is the stopping criterion. nr_weight, weight_label, and weight are used to change the penalty @@ -320,6 +434,10 @@ Library Usage If you do not want to change penalty for any of the classes, just set nr_weight to 0. + init_sol includes the initial weight vectors (supported for only some + solvers). See the explanation of the vector w in the model + structure. + *NOTE* To avoid wrong parameters, check_parameter() should be called before train(). @@ -333,13 +451,21 @@ Library Usage double *w; int *label; /* label of each class */ double bias; + double rho; /* one-class SVM only */ }; param describes the parameters used to obtain the model. - nr_class and nr_feature are the number of classes and features, respectively. + nr_class is the number of classes for classification. It is a + non-negative integer with special cases of 0 (no training data at + all) and 1 (all training data in one class). For regression and + one-class SVM, nr_class = 2. - The nr_feature*nr_class array w gives feature weights. We use one + nr_feature is the number of features. + + The array w gives feature weights. Its size is + nr_feature*nr_class but is nr_feature if nr_class = 2 and the + solver is not MCSVM_CS (see more explanation below). We use one against the rest for multi-class classification, so each feature index corresponds to nr_class weight values. Weights are organized in the following way @@ -349,13 +475,19 @@ Library Usage | for 1st feature | for 2nd feature | +------------------+------------------+------------+ + The array label stores class labels. 
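    As a small illustrative sketch of this layout (assuming the plain
    nr_feature*nr_class arrangement pictured above, and ignoring the
    binary-class and bias special cases explained in the following
    paragraphs, which the library's get_decfun_coef() handles):

        /* weight of feature feat_idx (1-based) for the class at label
           index label_idx, under the layout pictured above */
        double coef(const struct model *model_, int feat_idx, int label_idx)
        {
            return model_->w[(feat_idx - 1) * model_->nr_class + label_idx];
        }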
+ + When nr_class = 1 or 2, classification solvers (MCSVM_CS + excluded) return a single vector of weights by considering + label[0] as positive in training. + If bias >= 0, x becomes [x; bias]. The number of features is increased by one, so w is a (nr_feature+1)*nr_class array. The value of bias is stored in the variable bias. - The array label stores class labels. + rho is the bias term used in one-class SVM only. -- Function: void cross_validation(const problem *prob, const parameter *param, int nr_fold, int *target); +- Function: void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target); This function conducts cross validation. Data are separated to nr_fold folds. Under given parameters, sequentially each fold is @@ -365,23 +497,53 @@ Library Usage The format of prob is same as that for train(). -- Function: int predict(const model *model_, const feature_node *x); +- Function: void find_parameters(const struct problem *prob, + const struct parameter *param, int nr_fold, double start_C, + double start_p, double *best_C, double *best_p, double *best_score); - This functions classifies a test vector using the given - model. The predicted label is returned. + This function is similar to cross_validation. However, instead of + conducting cross validation under specified parameters. For -s 0, 2, it + conducts cross validation many times under parameters C = start_C, + 2*start_C, 4*start_C, 8*start_C, ..., and finds the best one with + the highest cross validation accuracy. For -s 11, it conducts cross + validation many times with a two-fold loop. The outer loop considers a + default sequence of p = 19/20*max_p, ..., 1/20*max_p, 0 and + under each p value the inner loop considers a sequence of parameters + C = start_C, 2*start_C, 4*start_C, ..., and finds the best one with the + lowest mean squared error. -- Function: int predict_values(const struct model *model_, + If start_C <= 0, then this procedure calculates a small enough C + for prob as the start_C. The procedure stops when the models of + all folds become stable or C reaches max_C. + + If start_p <= 0, then this procedure calculates a maximal p for prob as + the start_p. Otherwise, the procedure starts with the first + i/20*max_p <= start_p so the outer sequence is i/20*max_p, + (i-1)/20*max_p, ..., 0. + + The best C, the best p, and the corresponding accuracy (or MSE) are + assigned to *best_C, *best_p and *best_score, respectively. For + classification, *best_p is not used, and the returned value is -1. + +- Function: double predict(const model *model_, const feature_node *x); + + For a classification model, the predicted class for x is returned. + For a regression model, the function value of x calculated using + the model is returned. + +- Function: double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); - This function gives nr_w decision values in the array - dec_values. nr_w is 1 if there are two classes except multi-class - svm by Crammer and Singer (-s 4), and is the number of classes otherwise. + This function gives nr_w decision values in the array dec_values. + nr_w=1 if regression is applied or the number of classes is two. An exception is + multi-class SVM by Crammer and Singer (-s 4), where nr_w = 2 if there are two classes. For all other situations, nr_w is the + number of classes. - We implement one-vs-the rest multi-class strategy (-s 0,1,2,3) and - multi-class svm by Crammer and Singer (-s 4) for multi-class SVM. 
+ We implement one-vs-the rest multi-class strategy (-s 0,1,2,3,5,6,7) + and multi-class SVM by Crammer and Singer (-s 4) for multi-class SVM. The class with the highest decision value is returned. -- Function: int predict_probability(const struct model *model_, +- Function: double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); This function gives nr_class probability estimates in the array @@ -397,10 +559,36 @@ Library Usage - Function: int get_nr_class(const model *model_); The function gives the number of classes of the model. + For a regression model, 2 is returned. - Function: void get_labels(const model *model_, int* label); This function outputs the name of labels into an array called label. + For a regression model, label is unchanged. + +- Function: double get_decfun_coef(const struct model *model_, int feat_idx, + int label_idx); + + This function gives the coefficient for the feature with feature index = + feat_idx and the class with label index = label_idx. Note that feat_idx + starts from 1, while label_idx starts from 0. If feat_idx is not in the + valid range (1 to nr_feature), then a zero value will be returned. For + classification models, if label_idx is not in the valid range (0 to + nr_class-1), then a zero value will be returned; for regression models + and one-class SVM models, label_idx is ignored. + +- Function: double get_decfun_bias(const struct model *model_, int label_idx); + + This function gives the bias term corresponding to the class with the + label_idx. For classification models, if label_idx is not in a valid range + (0 to nr_class-1), then a zero value will be returned; for regression + models, label_idx is ignored. This function cannot be called for a one-class + SVM model. + +- Function: double get_decfun_rho(const struct model *model_); + + This function gives rho, the bias term used in one-class SVM only. This + function can only be called for a one-class SVM model. - Function: const char *check_parameter(const struct problem *prob, const struct parameter *param); @@ -410,6 +598,21 @@ Library Usage train() and cross_validation(). It returns NULL if the parameters are feasible, otherwise an error message is returned. +- Function: int check_probability_model(const struct model *model); + + This function returns 1 if the model supports probability output; + otherwise, it returns 0. + +- Function: int check_regression_model(const struct model *model); + + This function returns 1 if the model is a regression model; otherwise + it returns 0. + +- Function: int check_oneclass_model(const struct model *model); + + This function returns 1 if the model is a one-class SVM model; otherwise + it returns 0. + - Function: int save_model(const char *model_file_name, const struct model *model_); @@ -437,19 +640,19 @@ Library Usage - Function: void set_print_string_function(void (*print_func)(const char *)); Users can specify their output format by a function. Use - set_print_string_function(NULL); + set_print_string_function(NULL); for default printing to stdout. Building Windows Binaries ========================= -Windows binaries are in the directory `windows'. To build them via -Visual C++, use the following steps: +Windows binaries are available in the directory `windows'. To re-build +them via Visual C++, use the following steps: 1. Open a dos command box and change to liblinear directory. 
If environment variables of VC++ have not been set, type -"C:\Program Files\Microsoft Visual Studio 10.0\VC\bin\vcvars32.bat" +"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" You may have to modify the above command according which version of VC++ or where it is installed. @@ -458,13 +661,20 @@ VC++ or where it is installed. nmake -f Makefile.win clean all +3. (optional) To build shared library liblinear.dll, type + +nmake -f Makefile.win lib + +4. (Optional) To build 32-bit windows binaries, you must + (1) Setup "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars32.bat" instead of vcvars64.bat + (2) Change CFLAGS in Makefile.win: /D _WIN64 to /D _WIN32 MATLAB/OCTAVE Interface ======================= Please check the file README in the directory `matlab'. -PYTHON Interface +Python Interface ================ Please check the file README in the directory `python'. diff --git a/liblinear/blas/Makefile b/liblinear/blas/Makefile index 255d574d5..e0293a41c 100644 --- a/liblinear/blas/Makefile +++ b/liblinear/blas/Makefile @@ -1,14 +1,14 @@ -AR = ar -RANLIB = ranlib +AR ?= ar +RANLIB ?= ranlib -HEADERS = blas.h blas.h blasp.h -FILES = dnrm2.o daxpy.o ddot.o dscal.o +HEADERS = blas.h blasp.h +FILES = dnrm2.o daxpy.o ddot.o dscal.o -CFLAGS = $(OPTFLAGS) +CFLAGS = $(OPTFLAGS) FFLAGS = $(OPTFLAGS) blas: $(FILES) $(HEADERS) - $(AR) rcv blas.a $(FILES) + $(AR) rcv blas.a $(FILES) $(RANLIB) blas.a clean: diff --git a/liblinear/blas/blasp.h b/liblinear/blas/blasp.h index 745836db8..d0fe8eca0 100644 --- a/liblinear/blas/blasp.h +++ b/liblinear/blas/blasp.h @@ -3,6 +3,10 @@ /* Functions listed in alphabetical order */ +#ifdef __cplusplus +extern "C" { +#endif + #ifdef F2C_COMPAT void cdotc_(fcomplex *dotval, int *n, fcomplex *cx, int *incx, @@ -166,7 +170,7 @@ int dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *beta, double *c, int *ldc); int dgemv_(char *trans, int *m, int *n, double *alpha, double *a, - int *lda, double *x, int *incx, double *beta, double *y, + int *lda, double *x, int *incx, double *beta, double *y, int *incy); int dger_(int *m, int *n, double *alpha, double *x, int *incx, @@ -178,7 +182,7 @@ int drot_(int *n, double *sx, int *incx, double *sy, int *incy, int drotg_(double *sa, double *sb, double *c, double *s); int dsbmv_(char *uplo, int *n, int *k, double *alpha, double *a, - int *lda, double *x, int *incx, double *beta, double *y, + int *lda, double *x, int *incx, double *beta, double *y, int *incy); int dscal_(int *n, double *sa, double *sx, int *incx); @@ -227,14 +231,14 @@ int dtpsv_(char *uplo, char *trans, char *diag, int *n, double *ap, double *x, int *incx); int dtrmm_(char *side, char *uplo, char *transa, char *diag, int *m, - int *n, double *alpha, double *a, int *lda, double *b, + int *n, double *alpha, double *a, int *lda, double *b, int *ldb); int dtrmv_(char *uplo, char *trans, char *diag, int *n, double *a, int *lda, double *x, int *incx); int dtrsm_(char *side, char *uplo, char *transa, char *diag, int *m, - int *n, double *alpha, double *a, int *lda, double *b, + int *n, double *alpha, double *a, int *lda, double *b, int *ldb); int dtrsv_(char *uplo, char *trans, char *diag, int *n, double *a, @@ -254,7 +258,7 @@ int sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *beta, float *c, int *ldc); int sgemv_(char *trans, int *m, int *n, float *alpha, float *a, - int *lda, float *x, int *incx, float *beta, float *y, + int *lda, float *x, int *incx, float 
*beta, float *y, int *incy); int sger_(int *m, int *n, float *alpha, float *x, int *incx, @@ -266,7 +270,7 @@ int srot_(int *n, float *sx, int *incx, float *sy, int *incy, int srotg_(float *sa, float *sb, float *c, float *s); int ssbmv_(char *uplo, int *n, int *k, float *alpha, float *a, - int *lda, float *x, int *incx, float *beta, float *y, + int *lda, float *x, int *incx, float *beta, float *y, int *incy); int sscal_(int *n, float *sa, float *sx, int *incx); @@ -315,14 +319,14 @@ int stpsv_(char *uplo, char *trans, char *diag, int *n, float *ap, float *x, int *incx); int strmm_(char *side, char *uplo, char *transa, char *diag, int *m, - int *n, float *alpha, float *a, int *lda, float *b, + int *n, float *alpha, float *a, int *lda, float *b, int *ldb); int strmv_(char *uplo, char *trans, char *diag, int *n, float *a, int *lda, float *x, int *incx); int strsm_(char *side, char *uplo, char *transa, char *diag, int *m, - int *n, float *alpha, float *a, int *lda, float *b, + int *n, float *alpha, float *a, int *lda, float *b, int *ldb); int strsv_(char *uplo, char *trans, char *diag, int *n, float *a, @@ -428,3 +432,7 @@ int ztrsm_(char *side, char *uplo, char *transa, char *diag, int *m, int ztrsv_(char *uplo, char *trans, char *diag, int *n, dcomplex *a, int *lda, dcomplex *x, int *incx); + +#ifdef __cplusplus +} +#endif diff --git a/liblinear/blas/daxpy.c b/liblinear/blas/daxpy.c index 58f345a53..0a3c5942b 100644 --- a/liblinear/blas/daxpy.c +++ b/liblinear/blas/daxpy.c @@ -1,14 +1,18 @@ #include "blas.h" +#ifdef __cplusplus +extern "C" { +#endif + int daxpy_(int *n, double *sa, double *sx, int *incx, double *sy, int *incy) { long int i, m, ix, iy, nn, iincx, iincy; register double ssa; - /* constant times a vector plus a vector. - uses unrolled loop for increments equal to one. - jack dongarra, linpack, 3/11/78. + /* constant times a vector plus a vector. + uses unrolled loop for increments equal to one. + jack dongarra, linpack, 3/11/78. modified 12/3/93, array(1) declarations changed to array(*) */ /* Dereference inputs */ @@ -47,3 +51,7 @@ int daxpy_(int *n, double *sa, double *sx, int *incx, double *sy, return 0; } /* daxpy_ */ + +#ifdef __cplusplus +} +#endif diff --git a/liblinear/blas/ddot.c b/liblinear/blas/ddot.c index a64a2808f..294f4ed84 100644 --- a/liblinear/blas/ddot.c +++ b/liblinear/blas/ddot.c @@ -1,14 +1,18 @@ #include "blas.h" +#ifdef __cplusplus +extern "C" { +#endif + double ddot_(int *n, double *sx, int *incx, double *sy, int *incy) { long int i, m, nn, iincx, iincy; double stemp; long int ix, iy; - /* forms the dot product of two vectors. - uses unrolled loops for increments equal to one. - jack dongarra, linpack, 3/11/78. + /* forms the dot product of two vectors. + uses unrolled loops for increments equal to one. + jack dongarra, linpack, 3/11/78. 
modified 12/3/93, array(1) declarations changed to array(*) */ /* Dereference inputs */ @@ -48,3 +52,7 @@ double ddot_(int *n, double *sx, int *incx, double *sy, int *incy) return stemp; } /* ddot_ */ + +#ifdef __cplusplus +} +#endif diff --git a/liblinear/blas/dnrm2.c b/liblinear/blas/dnrm2.c index e50cdf777..d80b1a5be 100644 --- a/liblinear/blas/dnrm2.c +++ b/liblinear/blas/dnrm2.c @@ -1,18 +1,22 @@ #include /* Needed for fabs() and sqrt() */ #include "blas.h" +#ifdef __cplusplus +extern "C" { +#endif + double dnrm2_(int *n, double *x, int *incx) { long int ix, nn, iincx; double norm, scale, absxi, ssq, temp; -/* DNRM2 returns the euclidean norm of a vector via the function - name, so that +/* DNRM2 returns the euclidean norm of a vector via the function + name, so that - DNRM2 := sqrt( x'*x ) + DNRM2 := sqrt( x'*x ) - -- This version written on 25-October-1982. - Modified on 14-October-1993 to inline the call to SLASSQ. + -- This version written on 25-October-1982. + Modified on 14-October-1993 to inline the call to SLASSQ. Sven Hammarling, Nag Ltd. */ /* Dereference inputs */ @@ -24,13 +28,13 @@ double dnrm2_(int *n, double *x, int *incx) if (nn == 1) { norm = fabs(x[0]); - } + } else { scale = 0.0; ssq = 1.0; - /* The following loop is equivalent to this call to the LAPACK + /* The following loop is equivalent to this call to the LAPACK auxiliary routine: CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */ for (ix=(nn-1)*iincx; ix>=0; ix-=iincx) @@ -60,3 +64,7 @@ double dnrm2_(int *n, double *x, int *incx) return norm; } /* dnrm2_ */ + +#ifdef __cplusplus +} +#endif diff --git a/liblinear/blas/dscal.c b/liblinear/blas/dscal.c index a0eca0c39..5edbe8b25 100644 --- a/liblinear/blas/dscal.c +++ b/liblinear/blas/dscal.c @@ -1,14 +1,18 @@ #include "blas.h" +#ifdef __cplusplus +extern "C" { +#endif + int dscal_(int *n, double *sa, double *sx, int *incx) { long int i, m, nincx, nn, iincx; double ssa; - /* scales a vector by a constant. - uses unrolled loops for increment equal to 1. - jack dongarra, linpack, 3/11/78. - modified 3/93 to return if incx .le. 0. + /* scales a vector by a constant. + uses unrolled loops for increment equal to 1. + jack dongarra, linpack, 3/11/78. + modified 3/93 to return if incx .le. 0. 
modified 12/3/93, array(1) declarations changed to array(*) */ /* Dereference inputs */ @@ -42,3 +46,7 @@ int dscal_(int *n, double *sa, double *sx, int *incx) return 0; } /* dscal_ */ + +#ifdef __cplusplus +} +#endif diff --git a/liblinear/liblinear.vcxproj b/liblinear/liblinear.vcxproj index 3158c031f..2a2beef08 100755 --- a/liblinear/liblinear.vcxproj +++ b/liblinear/liblinear.vcxproj @@ -16,13 +16,13 @@ - + - + {A7BE3D76-F20C-40C5-8986-DE4028B3B57D} @@ -96,4 +96,4 @@ - \ No newline at end of file + diff --git a/liblinear/linear.cpp b/liblinear/linear.cpp index 152370373..70b79270a 100644 --- a/liblinear/linear.cpp +++ b/liblinear/linear.cpp @@ -3,8 +3,10 @@ #include #include #include +#include #include "linear.h" -#include "tron.h" +#include "newton.h" +int liblinear_version = LIBLINEAR_VERSION; typedef signed char schar; template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } #ifndef min @@ -14,18 +16,19 @@ template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } #endif template static inline void clone(T*& dst, S* src, int n) -{ +{ dst = new T[n]; memcpy((void *)dst,(void *)src,sizeof(T)*n); } -#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) #define INF HUGE_VAL +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) static void print_string_stdout(const char *s) { fputs(s,stdout); fflush(stdout); } +static void print_null(const char *s) {} static void (*liblinear_print_string) (const char *) = &print_string_stdout; @@ -42,143 +45,205 @@ static void info(const char *fmt,...) #else static void info(const char *fmt,...) {} #endif - -class l2r_lr_fun : public function +class sparse_operator { public: - l2r_lr_fun(const problem *prob, double Cp, double Cn); - ~l2r_lr_fun(); + static double nrm2_sq(const feature_node *x) + { + double ret = 0; + while(x->index != -1) + { + ret += x->value*x->value; + x++; + } + return ret; + } + + static double dot(const double *s, const feature_node *x) + { + double ret = 0; + while(x->index != -1) + { + ret += s[x->index-1]*x->value; + x++; + } + return ret; + } + + static double sparse_dot(const feature_node *x1, const feature_node *x2) + { + double ret = 0; + while(x1->index != -1 && x2->index != -1) + { + if(x1->index == x2->index) + { + ret += x1->value * x2->value; + ++x1; + ++x2; + } + else + { + if(x1->index > x2->index) + ++x2; + else + ++x1; + } + } + return ret; + } + + static void axpy(const double a, const feature_node *x, double *y) + { + while(x->index != -1) + { + y[x->index-1] += a*x->value; + x++; + } + } +}; + +// L2-regularized empirical risk minimization +// min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss + +class l2r_erm_fun: public function +{ +public: + l2r_erm_fun(const problem *prob, const parameter *param, double *C); + ~l2r_erm_fun(); double fun(double *w); - void grad(double *w, double *g); - void Hv(double *s, double *Hs); - + double linesearch_and_update(double *w, double *d, double *f, double *g, double alpha); int get_nr_variable(void); -private: +protected: + virtual double C_times_loss(int i, double wx_i) = 0; void Xv(double *v, double *Xv); void XTv(double *v, double *XTv); double *C; - double *z; - double *D; const problem *prob; + double *wx; + double *tmp; // a working array + double wTw; + int regularize_bias; }; -l2r_lr_fun::l2r_lr_fun(const problem *prob, double Cp, double Cn) +l2r_erm_fun::l2r_erm_fun(const problem *prob, const parameter *param, double *C) { - int i; int l=prob->l; - int *y=prob->y; this->prob = prob; - z = new double[l]; 
- D = new double[l]; - C = new double[l]; - - for (i=0; iC = C; + this->regularize_bias = param->regularize_bias; } -l2r_lr_fun::~l2r_lr_fun() +l2r_erm_fun::~l2r_erm_fun() { - delete[] z; - delete[] D; - delete[] C; + delete[] wx; + delete[] tmp; } - -double l2r_lr_fun::fun(double *w) +double l2r_erm_fun::fun(double *w) { int i; double f=0; - int *y=prob->y; int l=prob->l; int w_size=get_nr_variable(); - Xv(w, z); - for(i=0;i= 0) - f += C[i]*log(1 + exp(-yz)); - else - f += C[i]*(-yz+log(1 + exp(yz))); - } - f = 2*f; - for(i=0;iy; - int l=prob->l; - int w_size=get_nr_variable(); - - for(i=0;in; } -void l2r_lr_fun::Hv(double *s, double *Hs) +// On entry *f must be the function value of w +// On exit w is updated and *f is the new function value +double l2r_erm_fun::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha) { int i; - int l=prob->l; - int w_size=get_nr_variable(); - double *wa = new double[l]; + int l = prob->l; + double sTs = 0; + double wTs = 0; + double gTs = 0; + double eta = 0.01; + int w_size = get_nr_variable(); + int max_num_linesearch = 20; + double fold = *f; + Xv(s, tmp); - Xv(s, wa); - for(i=0;i= max_num_linesearch) + { + *f = fold; + return 0; + } + else + for (i=0;il; feature_node **x=prob->x; for(i=0;iindex!=-1) - { - Xv[i]+=v[s->index-1]*s->value; - s++; - } - } + Xv[i]=sparse_operator::dot(v, x[i]); } -void l2r_lr_fun::XTv(double *v, double *XTv) +void l2r_erm_fun::XTv(double *v, double *XTv) { int i; int l=prob->l; @@ -187,172 +252,226 @@ void l2r_lr_fun::XTv(double *v, double *XTv) for(i=0;il; + D = new double[l]; +} + +l2r_lr_fun::~l2r_lr_fun() +{ + delete[] D; +} + +double l2r_lr_fun::C_times_loss(int i, double wx_i) +{ + double ywx_i = wx_i * prob->y[i]; + if (ywx_i >= 0) + return C[i]*log(1 + exp(-ywx_i)); + else + return C[i]*(-ywx_i + log(1 + exp(ywx_i))); +} + +void l2r_lr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + for(i=0;iindex!=-1) + tmp[i] = 1/(1 + exp(-y[i]*wx[i])); + D[i] = tmp[i]*(1-tmp[i]); + tmp[i] = C[i]*(tmp[i]-1)*y[i]; + } + XTv(tmp, g); + + for(i=0;il; + int w_size=get_nr_variable(); + feature_node **x = prob->x; + + for (i=0; iindex!=-1) { - XTv[s->index-1]+=v[i]*s->value; - s++; + M[xi->index-1] += xi->value*xi->value*C[i]*D[i]; + xi++; } } } -class l2r_l2_svc_fun : public function -{ -public: - l2r_l2_svc_fun(const problem *prob, double Cp, double Cn); - ~l2r_l2_svc_fun(); - - double fun(double *w); - void grad(double *w, double *g); - void Hv(double *s, double *Hs); - - int get_nr_variable(void); - -private: - void Xv(double *v, double *Xv); - void subXv(double *v, double *Xv); - void subXTv(double *v, double *XTv); - - double *C; - double *z; - double *D; - int *I; - int sizeI; - const problem *prob; -}; - -l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, double Cp, double Cn) +void l2r_lr_fun::Hv(double *s, double *Hs) { int i; int l=prob->l; - int *y=prob->y; + int w_size=get_nr_variable(); + feature_node **x=prob->x; - this->prob = prob; - - z = new double[l]; - D = new double[l]; - C = new double[l]; - I = new int[l]; - - for (i=0; il]; } l2r_l2_svc_fun::~l2r_l2_svc_fun() { - delete[] z; - delete[] D; - delete[] C; delete[] I; } -double l2r_l2_svc_fun::fun(double *w) +double l2r_l2_svc_fun::C_times_loss(int i, double wx_i) { - int i; - double f=0; - int *y=prob->y; - int l=prob->l; - int w_size=get_nr_variable(); - - Xv(w, z); - for(i=0;i 0) - f += C[i]*d*d; - } - f = 2*f; - for(i=0;iy[i] * wx_i; + if (d > 0) + return C[i] * d * d; 
+ else + return 0; } void l2r_l2_svc_fun::grad(double *w, double *g) { int i; - int *y=prob->y; + double *y=prob->y; int l=prob->l; int w_size=get_nr_variable(); sizeI = 0; for (i=0;in; + int i; + int w_size=get_nr_variable(); + feature_node **x = prob->x; + + for (i=0; iindex!=-1) + { + M[xi->index-1] += xi->value*xi->value*C[idx]*2; + xi++; + } + } } void l2r_l2_svc_fun::Hv(double *s, double *Hs) { int i; - int l=prob->l; int w_size=get_nr_variable(); - double *wa = new double[l]; + feature_node **x=prob->x; - subXv(s, wa); + for(i=0;il; - feature_node **x=prob->x; - - for(i=0;iindex!=-1) - { - Xv[i]+=v[s->index-1]*s->value; - s++; - } - } -} - -void l2r_l2_svc_fun::subXv(double *v, double *Xv) -{ - int i; - feature_node **x=prob->x; - - for(i=0;iindex!=-1) - { - Xv[i]+=v[s->index-1]*s->value; - s++; - } - } + if(regularize_bias == 0) + Hs[w_size-1] -= s[w_size-1]; } void l2r_l2_svc_fun::subXTv(double *v, double *XTv) @@ -364,29 +483,87 @@ void l2r_l2_svc_fun::subXTv(double *v, double *XTv) for(i=0;iindex!=-1) - { - XTv[s->index-1]+=v[i]*s->value; - s++; - } - } + sparse_operator::axpy(v[i], x[I[i]], XTv); } -// A coordinate descent algorithm for +class l2r_l2_svr_fun: public l2r_l2_svc_fun +{ +public: + l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C); + + void grad(double *w, double *g); + +private: + double C_times_loss(int i, double wx_i); + double p; +}; + +l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C): + l2r_l2_svc_fun(prob, param, C) +{ + this->p = param->p; + this->regularize_bias = param->regularize_bias; +} + +double l2r_l2_svr_fun::C_times_loss(int i, double wx_i) +{ + double d = wx_i - prob->y[i]; + if(d < -p) + return C[i]*(d+p)*(d+p); + else if(d > p) + return C[i]*(d-p)*(d-p); + return 0; +} + +void l2r_l2_svr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + double d; + + sizeI = 0; + for(i=0;i p) + { + tmp[sizeI] = C[i]*(d-p); + I[sizeI] = i; + sizeI++; + } + + } + subXTv(tmp, g); + + for(i=0;iy[i]) +#define GETI(i) ((int) prob->y[i]) // To support weights for instances, use GETI(i) (i) class Solver_MCSVM_CS @@ -455,8 +632,8 @@ void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int act double beta = D[0] - A_i*C_yi; for(r=1;ry[i] == m + // alpha[i*nr_class+m] <= 0 if prob->y[i] != m + // If initial alpha isn't zero, uncomment the for loop below to initialize w for(i=0;iindex != -1) { - QD[i] += (xi->value)*(xi->value); + double val = xi->value; + QD[i] += val*val; + + // Uncomment the for loop if initial alpha isn't zero + // for(m=0; mindex-1)*nr_class+m] += alpha[i*nr_class+m]*val; xi++; } active_size_i[i] = nr_class; - y_index[i] = prob->y[i]; + y_index[i] = (int)prob->y[i]; index[i] = i; } - while(iter < max_iter) + while(iter < max_iter) { double stopping = -INF; for(i=0;iy[i]] < C[GETI(i)] && G[y_index[i]] < minG) + if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG) minG = G[y_index[i]]; for(m=0;mm) { - if(!be_shrunk(i, active_size_i[i], y_index[i], + if(!be_shrunk(i, active_size_i[i], y_index[i], alpha_i[alpha_index_i[active_size_i[i]]], minG)) { swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]); swap(G[m], G[active_size_i[i]]); if(y_index[i] == active_size_i[i]) y_index[i] = m; - else if(y_index[i] == m) + else if(y_index[i] == m) y_index[i] = active_size_i[i]; break; } @@ -585,7 +773,7 @@ void Solver_MCSVM_CS::Solve(double *w) { active_size--; swap(index[s], index[active_size]); - s--; + s--; 
continue; } @@ -663,7 +851,7 @@ void Solver_MCSVM_CS::Solve(double *w) nSV++; } for(i=0;iy[i]]; + v -= alpha[i*nr_class+(int)prob->y[i]]; info("Objective value = %lf\n",v); info("nSV = %d\n",nSV); @@ -678,46 +866,47 @@ void Solver_MCSVM_CS::Solve(double *w) delete [] active_size_i; } -// A coordinate descent algorithm for +// A coordinate descent algorithm for // L1-loss and L2-loss SVM dual problems // // min_\alpha 0.5(\alpha^T (Q + D)\alpha) - e^T \alpha, -// s.t. 0 <= alpha_i <= upper_bound_i, -// +// s.t. 0 <= \alpha_i <= upper_bound_i, +// // where Qij = yi yj xi^T xj and -// D is a diagonal matrix +// D is a diagonal matrix // // In L1-SVM case: -// upper_bound_i = Cp if y_i = 1 -// upper_bound_i = Cn if y_i = -1 -// D_ii = 0 +// upper_bound_i = Cp if y_i = 1 +// upper_bound_i = Cn if y_i = -1 +// D_ii = 0 // In L2-SVM case: -// upper_bound_i = INF -// D_ii = 1/(2*Cp) if y_i = 1 -// D_ii = 1/(2*Cn) if y_i = -1 +// upper_bound_i = INF +// D_ii = 1/(2*Cp) if y_i = 1 +// D_ii = 1/(2*Cn) if y_i = -1 // -// Given: +// Given: // x, y, Cp, Cn // eps is the stopping tolerance // // solution will be put in w -// +// +// this function returns the number of iterations +// // See Algorithm 3 of Hsieh et al., ICML 2008 #undef GETI #define GETI(i) (y[i]+1) // To support weights for instances, use GETI(i) (i) -static void solve_l2r_l1l2_svc( - const problem *prob, double *w, double eps, - double Cp, double Cn, int solver_type) +static int solve_l2r_l1l2_svc(const problem *prob, const parameter *param, double *w, double Cp, double Cn, int max_iter=300) { int l = prob->l; int w_size = prob->n; + double eps = param->eps; + int solver_type = param->solver_type; int i, s, iter = 0; double C, d, G; double *QD = new double[l]; - int max_iter = 1000; int *index = new int[l]; double *alpha = new double[l]; schar *y = new schar[l]; @@ -740,27 +929,33 @@ static void solve_l2r_l1l2_svc( upper_bound[2] = Cp; } - for(i=0; iy[i] > 0) { - y[i] = +1; + y[i] = +1; } else { y[i] = -1; } + } + + // Initial alpha can be set here. 
Note that + // 0 <= alpha[i] <= upper_bound[GETI(i)] + for(i=0; ix[i]; - while (xi->index != -1) - { - QD[i] += (xi->value)*(xi->value); - xi++; - } + feature_node * const xi = prob->x[i]; + QD[i] += sparse_operator::nrm2_sq(xi); + sparse_operator::axpy(y[i]*alpha[i], xi, w); + index[i] = i; } @@ -778,16 +973,10 @@ static void solve_l2r_l1l2_svc( for (s=0; sx[i]; - feature_node *xi = prob->x[i]; - while(xi->index!= -1) - { - G += w[xi->index-1]*(xi->value); - xi++; - } - G = G*yi-1; + G = yi*sparse_operator::dot(w, xi)-1; C = upper_bound[GETI(i)]; G += alpha[i]*diag[GETI(i)]; @@ -828,12 +1017,7 @@ static void solve_l2r_l1l2_svc( double alpha_old = alpha[i]; alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C); d = (alpha[i] - alpha_old)*yi; - xi = prob->x[i]; - while (xi->index != -1) - { - w[xi->index-1] += d*xi->value; - xi++; - } + sparse_operator::axpy(d, xi, w); } } @@ -841,7 +1025,8 @@ static void solve_l2r_l1l2_svc( if(iter % 10 == 0) info("."); - if(PGmax_new - PGmin_new <= eps) + if(PGmax_new - PGmin_new <= eps && + fabs(PGmax_new) <= eps && fabs(PGmin_new) <= eps) { if(active_size == l) break; @@ -863,8 +1048,6 @@ static void solve_l2r_l1l2_svc( } info("\noptimization finished, #iter = %d\n",iter); - if (iter >= max_iter) - info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n"); // calculate objective value @@ -885,68 +1068,292 @@ static void solve_l2r_l1l2_svc( delete [] alpha; delete [] y; delete [] index; + + return iter; } -// A coordinate descent algorithm for + +// A coordinate descent algorithm for +// L1-loss and L2-loss epsilon-SVR dual problem +// +// min_\beta 0.5\beta^T (Q + diag(lambda)) \beta - p \sum_{i=1}^l|\beta_i| + \sum_{i=1}^l yi\beta_i, +// s.t. -upper_bound_i <= \beta_i <= upper_bound_i, +// +// where Qij = xi^T xj and +// D is a diagonal matrix +// +// In L1-SVM case: +// upper_bound_i = C +// lambda_i = 0 +// In L2-SVM case: +// upper_bound_i = INF +// lambda_i = 1/(2*C) +// +// Given: +// x, y, p, C +// eps is the stopping tolerance +// +// solution will be put in w +// +// this function returns the number of iterations +// +// See Algorithm 4 of Ho and Lin, 2012 + +#undef GETI +#define GETI(i) (0) +// To support weights for instances, use GETI(i) (i) + +static int solve_l2r_l1l2_svr(const problem *prob, const parameter *param, double *w, int max_iter=300) +{ + const int solver_type = param->solver_type; + int l = prob->l; + double C = param->C; + double p = param->p; + int w_size = prob->n; + double eps = param->eps; + int i, s, iter = 0; + int active_size = l; + int *index = new int[l]; + + double d, G, H; + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double *beta = new double[l]; + double *QD = new double[l]; + double *y = prob->y; + + // L2R_L2LOSS_SVR_DUAL + double lambda[1], upper_bound[1]; + lambda[0] = 0.5/C; + upper_bound[0] = INF; + + if(solver_type == L2R_L1LOSS_SVR_DUAL) + { + lambda[0] = 0; + upper_bound[0] = C; + } + + // Initial beta can be set here. 
Note that + // -upper_bound <= beta[i] <= upper_bound + for(i=0; ix[i]; + QD[i] = sparse_operator::nrm2_sq(xi); + sparse_operator::axpy(beta[i], xi, w); + + index[i] = i; + } + + + while(iter < max_iter) + { + Gmax_new = 0; + Gnorm1_new = 0; + + for(i=0; ix[i]; + G += sparse_operator::dot(w, xi); + + double Gp = G+p; + double Gn = G-p; + double violation = 0; + if(beta[i] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old && Gn<-Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] >= upper_bound[GETI(i)]) + { + if(Gp > 0) + violation = Gp; + else if(Gp < -Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] <= -upper_bound[GETI(i)]) + { + if(Gn < 0) + violation = -Gn; + else if(Gn > Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + + // obtain Newton direction d + if(Gp < H*beta[i]) + d = -Gp/H; + else if(Gn > H*beta[i]) + d = -Gn/H; + else + d = -beta[i]; + + if(fabs(d) < 1.0e-12) + continue; + + double beta_old = beta[i]; + beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]); + d = beta[i]-beta_old; + + if(d != 0) + sparse_operator::axpy(d, xi, w); + } + + if(iter == 0) + Gnorm1_init = Gnorm1_new; + iter++; + if(iter % 10 == 0) + info("."); + + if(Gnorm1_new <= eps*Gnorm1_init) + { + if(active_size == l) + break; + else + { + active_size = l; + info("*"); + Gmax_old = INF; + continue; + } + } + + Gmax_old = Gmax_new; + } + + info("\noptimization finished, #iter = %d\n", iter); + + // calculate objective value + double v = 0; + int nSV = 0; + for(i=0; il; int w_size = prob->n; + double eps = param->eps; int i, s, iter = 0; double *xTx = new double[l]; - int max_iter = 1000; - int *index = new int[l]; + int *index = new int[l]; double *alpha = new double[2*l]; // store alpha and C - alpha - schar *y = new schar[l]; + schar *y = new schar[l]; int max_inner_iter = 100; // for inner Newton - double innereps = 1e-2; + double innereps = 1e-2; double innereps_min = min(1e-8, eps); double upper_bound[3] = {Cn, 0, Cp}; - for(i=0; iy[i] > 0) { - y[i] = +1; + y[i] = +1; } else { y[i] = -1; } + } + + // Initial alpha can be set here. 
Note that + // 0 < alpha[i] < upper_bound[GETI(i)] + // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)] + for(i=0; ix[i]; - while (xi->index != -1) - { - xTx[i] += (xi->value)*(xi->value); - w[xi->index-1] += y[i]*alpha[2*i]*xi->value; - xi++; - } + for(i=0; ix[i]; + xTx[i] = sparse_operator::nrm2_sq(xi); + sparse_operator::axpy(y[i]*alpha[2*i], xi, w); index[i] = i; } @@ -962,21 +1369,16 @@ void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, do for (s=0; sx[i]; - while (xi->index != -1) - { - ywTx += w[xi->index-1]*xi->value; - xi++; - } - ywTx *= y[i]; + feature_node * const xi = prob->x[i]; + ywTx = yi*sparse_operator::dot(w, xi); double a = xisq, b = ywTx; // Decide to minimize g_1(z) or g_2(z) int ind1 = 2*i, ind2 = 2*i+1, sign = 1; - if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0) + if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0) { ind1 = 2*i+1; ind2 = 2*i; @@ -986,7 +1388,7 @@ void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, do // g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old) double alpha_old = alpha[ind1]; double z = alpha_old; - if(C - z < 0.5 * C) + if(C - z < 0.5 * C) z = 0.1*z; double gp = a*(z-alpha_old)+sign*b+log(z/(C-z)); Gmax = max(Gmax, fabs(gp)); @@ -994,13 +1396,13 @@ void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, do // Newton method on the sub-problem const double eta = 0.1; // xi in the paper int inner_iter = 0; - while (inner_iter <= max_inner_iter) + while (inner_iter <= max_inner_iter) { if(fabs(gp) < innereps) break; double gpp = a + C/(C-z)/z; double tmpz = z - gp/gpp; - if(tmpz <= 0) + if(tmpz <= 0) z *= eta; else // tmpz in (0, C) z = tmpz; @@ -1013,12 +1415,7 @@ void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, do { alpha[ind1] = z; alpha[ind2] = C-z; - xi = prob->x[i]; - while (xi->index != -1) - { - w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value; - xi++; - } + sparse_operator::axpy(sign*(z-alpha_old)*yi, xi, w); } } @@ -1026,26 +1423,24 @@ void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, do if(iter % 10 == 0) info("."); - if(Gmax < eps) + if(Gmax < eps) break; - if(newton_iter <= l/10) + if(newton_iter <= l/10) innereps = max(innereps_min, 0.1*innereps); } info("\noptimization finished, #iter = %d\n",iter); - if (iter >= max_iter) - info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n"); // calculate objective value - + double v = 0; for(i=0; il; int w_size = prob_col->n; + int regularize_bias = param->regularize_bias; int j, s, iter = 0; int max_iter = 1000; int active_size = w_size; @@ -1087,9 +1488,9 @@ static void solve_l1r_l2_svc( double d, G_loss, G, H; double Gmax_old = INF; double Gmax_new, Gnorm1_new; - double Gnorm1_init; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration double d_old, d_diff; - double loss_old, loss_new; + double loss_old = 0, loss_new; double appxcond, cond; int *index = new int[w_size]; @@ -1100,6 +1501,10 @@ static void solve_l1r_l2_svc( double C[3] = {Cn,0,Cp}; + // Initial w can be set here. 
+ for(j=0; jx[j]; while(x->index != -1) { int ind = x->index-1; - double val = x->value; x->value *= y[ind]; // x->value stores yi*xij + double val = x->value; + b[ind] -= w[j]*val; xj_sq[j] += C[GETI(ind)]*val*val; x++; } @@ -1160,59 +1565,72 @@ static void solve_l1r_l2_svc( H *= 2; H = max(H, 1e-12); - double Gp = G+1; - double Gn = G-1; double violation = 0; - if(w[j] == 0) - { - if(Gp < 0) - violation = -Gp; - else if(Gn > 0) - violation = Gn; - else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) - { - active_size--; - swap(index[s], index[active_size]); - s--; - continue; - } - } - else if(w[j] > 0) - violation = fabs(Gp); + double Gp = 0, Gn = 0; + if(j == w_size-1 && regularize_bias == 0) + violation = fabs(G); else - violation = fabs(Gn); - + { + Gp = G+1; + Gn = G-1; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + } Gmax_new = max(Gmax_new, violation); Gnorm1_new += violation; // obtain Newton direction d - if(Gp <= H*w[j]) - d = -Gp/H; - else if(Gn >= H*w[j]) - d = -Gn/H; + if(j == w_size-1 && regularize_bias == 0) + d = -G/H; else - d = -w[j]; + { + if(Gp < H*w[j]) + d = -Gp/H; + else if(Gn > H*w[j]) + d = -Gn/H; + else + d = -w[j]; + } if(fabs(d) < 1.0e-12) continue; - double delta = fabs(w[j]+d)-fabs(w[j]) + G*d; + double delta; + if(j == w_size-1 && regularize_bias == 0) + delta = G*d; + else + delta = fabs(w[j]+d)-fabs(w[j]) + G*d; d_old = 0; int num_linesearch; for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++) { d_diff = d_old - d; - cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta; + if(j == w_size-1 && regularize_bias == 0) + cond = -sigma*delta; + else + cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta; appxcond = xj_sq[j]*d*d + G_loss*d + cond; if(appxcond <= 0) { x = prob_col->x[j]; - while(x->index != -1) - { - b[x->index-1] += d_diff*x->value; - x++; - } + sparse_operator::axpy(d_diff, x, b); break; } @@ -1272,11 +1690,7 @@ static void solve_l1r_l2_svc( { if(w[i]==0) continue; x = prob_col->x[i]; - while(x->index != -1) - { - b[x->index-1] -= w[i]*x->value; - x++; - } + sparse_operator::axpy(-w[i], x, b); } } } @@ -1325,6 +1739,8 @@ static void solve_l1r_l2_svc( nnz++; } } + if (regularize_bias == 0) + v -= fabs(w[w_size-1]); for(j=0; j 0) v += C[GETI(j)]*b[j]*b[j]; @@ -1336,31 +1752,37 @@ static void solve_l1r_l2_svc( delete [] y; delete [] b; delete [] xj_sq; + + return iter; } -// A coordinate descent algorithm for +// A coordinate descent algorithm for // L1-regularized logistic regression problems // // min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)), // -// Given: +// Given: // x, y, Cp, Cn // eps is the stopping tolerance // // solution will be put in w // +// this function returns the number of iterations +// // See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008) +// +// To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1 +// must have been added to the original data. 
(see -B and -R option) #undef GETI #define GETI(i) (y[i]+1) // To support weights for instances, use GETI(i) (i) -static void solve_l1r_lr( - const problem *prob_col, double *w, double eps, - double Cp, double Cn) +static int solve_l1r_lr(const problem *prob_col, const parameter *param, double *w, double Cp, double Cn, double eps) { int l = prob_col->l; int w_size = prob_col->n; + int regularize_bias = param->regularize_bias; int j, s, newton_iter=0, iter=0; int max_newton_iter = 100; int max_iter = 1000; @@ -1371,9 +1793,9 @@ static void solve_l1r_lr( double nu = 1e-12; double inner_eps = 1; double sigma = 0.01; - double w_norm=0, w_norm_new; + double w_norm, w_norm_new; double z, G, H; - double Gnorm1_init; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration double Gmax_old = INF; double Gmax_new, Gnorm1_new; double QP_Gmax_old = INF; @@ -1395,6 +1817,10 @@ static void solve_l1r_lr( double C[3] = {Cn,0,Cp}; + // Initial w can be set here. + for(j=0; jy[j] > 0) @@ -1402,14 +1828,13 @@ static void solve_l1r_lr( else y[j] = -1; - // assume initial w is 0 - exp_wTx[j] = 1; - tau[j] = C[GETI(j)]*0.5; - D[j] = C[GETI(j)]*0.25; + exp_wTx[j] = 0; } + + w_norm = 0; for(j=0; jindex != -1) { int ind = x->index-1; + double val = x->value; + exp_wTx[ind] += w[j]*val; if(y[ind] == -1) - xjneg_sum[j] += C[GETI(ind)]*x->value; + xjneg_sum[j] += C[GETI(ind)]*val; x++; } } + if (regularize_bias == 0) + w_norm -= fabs(w[w_size-1]); + + for(j=0; j 0) - violation = Gn; - //outer-level shrinking - else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) - { - active_size--; - swap(index[s], index[active_size]); - s--; - continue; - } - } - else if(w[j] > 0) - violation = fabs(Gp); + if (j == w_size-1 && regularize_bias == 0) + violation = fabs(Grad[j]); else - violation = fabs(Gn); - + { + double Gp = Grad[j]+1; + double Gn = Grad[j]-1; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //outer-level shrinking + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + } Gmax_new = max(Gmax_new, violation); Gnorm1_new += violation; } @@ -1512,40 +1953,48 @@ static void solve_l1r_lr( x++; } - double Gp = G+1; - double Gn = G-1; double violation = 0; - if(wpd[j] == 0) + if (j == w_size-1 && regularize_bias == 0) { - if(Gp < 0) - violation = -Gp; - else if(Gn > 0) - violation = Gn; - //inner-level shrinking - else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l) - { - QP_active_size--; - swap(index[s], index[QP_active_size]); - s--; - continue; - } + // bias term not shrunken + violation = fabs(G); + z = -G/H; } - else if(wpd[j] > 0) - violation = fabs(Gp); else - violation = fabs(Gn); + { + double Gp = G+1; + double Gn = G-1; + if(wpd[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //inner-level shrinking + else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l) + { + QP_active_size--; + swap(index[s], index[QP_active_size]); + s--; + continue; + } + } + else if(wpd[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + // obtain solution of one-variable problem + if(Gp < H*wpd[j]) + z = -Gp/H; + else if(Gn > H*wpd[j]) + z = -Gn/H; + else + z = -wpd[j]; + } QP_Gmax_new = max(QP_Gmax_new, violation); QP_Gnorm1_new += violation; - // obtain solution of one-variable problem - if(Gp <= H*wpd[j]) - z = -Gp/H; - else if(Gn >= H*wpd[j]) - z = -Gn/H; - else - z = -wpd[j]; - if(fabs(z) < 1.0e-12) 
continue; z = min(max(z,-10.0),10.0); @@ -1553,12 +2002,7 @@ static void solve_l1r_lr( wpd[j] += z; x = prob_col->x[j]; - while(x->index != -1) - { - int ind = x->index-1; - xTd[ind] += x->value*z; - x++; - } + sparse_operator::axpy(z, x, xTd); } iter++; @@ -1591,6 +2035,8 @@ static void solve_l1r_lr( if(wpd[j] != 0) w_norm_new += fabs(wpd[j]); } + if (regularize_bias == 0) + w_norm_new -= fabs(wpd[w_size-1]); delta += (w_norm_new-w_norm); negsum_xTd = 0; @@ -1633,6 +2079,8 @@ static void solve_l1r_lr( if(wpd[j] != 0) w_norm_new += fabs(wpd[j]); } + if (regularize_bias == 0) + w_norm_new -= fabs(wpd[w_size-1]); delta *= 0.5; negsum_xTd *= 0.5; for(int i=0; ix[i]; - while(x->index != -1) - { - exp_wTx[x->index-1] += w[i]*x->value; - x++; - } + sparse_operator::axpy(w[i], x, exp_wTx); } for(int i=0; i= pivot +static int partition(feature_node *nodes, int low, int high) +{ + int i; + int index; + + swap(nodes[low + rand()%(high-low+1)], nodes[high]); // select and move pivot to the end + + index = low; + for(i = low; i < high; i++) + if (compare_feature_node(&nodes[i], &nodes[high]) == -1) + { + swap(nodes[index], nodes[i]); + index++; + } + + swap(nodes[high], nodes[index]); + return index; +} + +// rearrange nodes so that nodes[:k] contains nodes with the k smallest values. +static void quick_select_min_k(feature_node *nodes, int low, int high, int k) +{ + int pivot; + if(low == high) + return; + pivot = partition(nodes, low, high); + if(pivot == k) + return; + else if(k-1 < pivot) + return quick_select_min_k(nodes, low, pivot-1, k); + else + return quick_select_min_k(nodes, pivot+1, high, k); +} + +// A two-level coordinate descent algorithm for +// a scaled one-class SVM dual problem +// +// min_\alpha 0.5(\alpha^T Q \alpha), +// s.t. 0 <= \alpha_i <= 1 and +// e^T \alpha = \nu l +// +// where Qij = xi^T xj +// +// Given: +// x, nu +// eps is the stopping tolerance +// +// solution will be put in w and rho +// +// this function returns the number of iterations +// +// See Algorithm 7 in supplementary materials of Chou et al., SDM 2020. 
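For context outside the patch itself: a minimal sketch of how a caller might drive this new one-class solver through the public API added elsewhere in this upgrade (solver_type ONECLASS_SVM, the parameter.nu field, predict() now returning a double, and get_decfun_rho()). The toy feature values, point count, and settings below are illustrative assumptions only and are not part of the upstream sources; per the predict_values() change later in this patch, predict() reports +1 for inliers and -1 for outliers, i.e. the sign of w'x - rho.

    #include <stdio.h>
    #include "linear.h"

    int main(void)
    {
        /* four 2-feature points; index -1 terminates each row as usual */
        struct feature_node x[4][3] = {
            {{1, 0.9}, {2, 1.1}, {-1, 0}},
            {{1, 1.0}, {2, 1.0}, {-1, 0}},
            {{1, 1.1}, {2, 0.9}, {-1, 0}},
            {{1, 5.0}, {2, 5.0}, {-1, 0}},      /* likely outlier */
        };
        struct feature_node *rows[4] = {x[0], x[1], x[2], x[3]};
        double y[4] = {1, 1, 1, 1};             /* labels are ignored by the one-class solver */
        struct problem prob = {4, 2, y, rows, -1};  /* l, n, y, x, bias (<0: no bias; -s 21 rejects bias >= 0) */

        struct parameter param = {0};
        param.solver_type = ONECLASS_SVM;       /* -s 21 */
        param.eps = 0.01;                       /* default tolerance for -s 21 per the train.c help text */
        param.C = 1;
        param.nu = 0.5;                         /* roughly the fraction of data pushed to the upper bound */
        param.regularize_bias = 1;              /* 0 is only valid for the -R solvers */

        const char *err = check_parameter(&prob, &param);
        if (err) { fprintf(stderr, "%s\n", err); return 1; }

        struct model *m = train(&prob, &param);
        printf("rho = %g, prediction for last point = %g\n",
               get_decfun_rho(m), predict(m, rows[3]));

        free_and_destroy_model(&m);
        destroy_param(&param);
        return 0;
    }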
+ +static int solve_oneclass_svm(const problem *prob, const parameter *param, double *w, double *rho) +{ + int l = prob->l; + int w_size = prob->n; + double eps = param->eps; + double nu = param->nu; + int i, j, s, iter = 0; + double Gi, Gj; + double Qij, quad_coef, delta, sum; + double old_alpha_i; + double *QD = new double[l]; + double *G = new double[l]; + int *index = new int[l]; + double *alpha = new double[l]; + int max_inner_iter; + int max_iter = 1000; + int active_size = l; + + double negGmax; // max { -grad(f)_i | i in Iup } + double negGmin; // min { -grad(f)_i | i in Ilow } + // Iup = { i | alpha_i < 1 }, Ilow = { i | alpha_i > 0 } + feature_node *max_negG_of_Iup = new feature_node[l]; + feature_node *min_negG_of_Ilow = new feature_node[l]; + feature_node node; + + int n = (int)(nu*l); // # of alpha's at upper bound + for(i=0; ix[i]; + QD[i] = sparse_operator::nrm2_sq(xi); + sparse_operator::axpy(alpha[i], xi, w); + + index[i] = i; + } + + while (iter < max_iter) + { + negGmax = -INF; + negGmin = INF; + + for (s=0; sx[i]; + G[i] = sparse_operator::dot(w, xi); + if (alpha[i] < 1) + negGmax = max(negGmax, -G[i]); + if (alpha[i] > 0) + negGmin = min(negGmin, -G[i]); + } + + if (negGmax - negGmin < eps) + { + if (active_size == l) + break; + else + { + active_size = l; + info("*"); + continue; + } + } + + for(s=0; s negGmax) || + (alpha[i] == 0 && -G[i] < negGmin)) + { + active_size--; + swap(index[s], index[active_size]); + s--; + } + } + + max_inner_iter = max(active_size/10, 1); + int len_Iup = 0; + int len_Ilow = 0; + for(s=0; s 0) + { + min_negG_of_Ilow[len_Ilow] = node; + len_Ilow++; + } + } + max_inner_iter = min(max_inner_iter, min(len_Iup, len_Ilow)); + + quick_select_min_k(max_negG_of_Iup, 0, len_Iup-1, len_Iup-max_inner_iter); + qsort(&(max_negG_of_Iup[len_Iup-max_inner_iter]), max_inner_iter, sizeof(struct feature_node), compare_feature_node); + + quick_select_min_k(min_negG_of_Ilow, 0, len_Ilow-1, max_inner_iter); + qsort(min_negG_of_Ilow, max_inner_iter, sizeof(struct feature_node), compare_feature_node); + + for (s=0; sx[i]; + feature_node const * xj = prob->x[j]; + + Gi = sparse_operator::dot(w, xi); + Gj = sparse_operator::dot(w, xj); + + int violating_pair = 0; + if (alpha[i] < 1 && alpha[j] > 0 && -Gj + 1e-12 < -Gi) + violating_pair = 1; + else + if (alpha[i] > 0 && alpha[j] < 1 && -Gi + 1e-12 < -Gj) + violating_pair = 1; + if (violating_pair == 0) + continue; + + Qij = sparse_operator::sparse_dot(xi, xj); + quad_coef = QD[i] + QD[j] - 2*Qij; + if(quad_coef <= 0) + quad_coef = 1e-12; + delta = (Gi - Gj) / quad_coef; + old_alpha_i = alpha[i]; + sum = alpha[i] + alpha[j]; + alpha[i] = alpha[i] - delta; + alpha[j] = alpha[j] + delta; + if (sum > 1) + { + if (alpha[i] > 1) + { + alpha[i] = 1; + alpha[j] = sum - 1; + } + } + else + { + if (alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = sum; + } + } + if (sum > 1) + { + if (alpha[j] > 1) + { + alpha[j] = 1; + alpha[i] = sum - 1; + } + } + else + { + if (alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = sum; + } + } + delta = alpha[i] - old_alpha_i; + sparse_operator::axpy(delta, xi, w); + sparse_operator::axpy(-delta, xj, w); + } + iter++; + if (iter % 10 == 0) + info("."); + } + info("\noptimization finished, #iter = %d\n",iter); + if (iter >= max_iter) + info("\nWARNING: reaching max number of iterations\n\n"); + + // calculate object value + double v = 0; + for(i=0; i 0) + ++nSV; + } + info("Objective value = %lf\n", v/2); + info("nSV = %d\n", nSV); + + // calculate rho + double nr_free = 0; + double ub = INF, lb = 
-INF, sum_free = 0; + for(i=0; ix[i]); + if (alpha[i] == 1) + lb = max(lb, G); + else if (alpha[i] == 0) + ub = min(ub, G); + else + { + ++nr_free; + sum_free += G; + } + } + + if (nr_free > 0) + *rho = sum_free/nr_free; + else + *rho = (ub + lb)/2; + info("rho = %lf\n", *rho); + + delete [] QD; + delete [] G; + delete [] index; + delete [] alpha; + delete [] max_negG_of_Iup; + delete [] min_negG_of_Ilow; + + return iter; } // transpose matrix X from row format to column format @@ -1713,12 +2470,12 @@ static void transpose(const problem *prob, feature_node **x_space_ret, problem * int i; int l = prob->l; int n = prob->n; - int nnz = 0; - int *col_ptr = new int[n+1]; + size_t nnz = 0; + size_t *col_ptr = new size_t [n+1]; feature_node *x_space; prob_col->l = l; prob_col->n = n; - prob_col->y = new int[l]; + prob_col->y = new double[l]; prob_col->x = new feature_node*[n]; for(i=0; iy[i]; + int this_label = (int)prob->y[i]; int j; for(j=0;jeps; - int pos = 0; - int neg = 0; - for(int i=0;il;i++) - if(prob->y[i]==+1) - pos++; - neg = prob->l - pos; + int solver_type = param->solver_type; + int dual_solver_max_iter = 300; + int iter; - function *fun_obj=NULL; - switch(param->solver_type) + bool is_regression = (solver_type==L2R_L2LOSS_SVR || + solver_type==L2R_L1LOSS_SVR_DUAL || + solver_type==L2R_L2LOSS_SVR_DUAL); + + // Some solvers use Cp,Cn but not C array; extensions possible but no plan for now + double *C = new double[prob->l]; + double primal_solver_tol = param->eps; + if(is_regression) + { + for(int i=0;il;i++) + C[i] = param->C; + } + else + { + int pos = 0; + for(int i=0;il;i++) + { + if(prob->y[i] > 0) + { + pos++; + C[i] = Cp; + } + else + C[i] = Cn; + } + int neg = prob->l - pos; + primal_solver_tol = param->eps*max(min(pos,neg), 1)/prob->l; + } + + switch(solver_type) { case L2R_LR: { - fun_obj=new l2r_lr_fun(prob, Cp, Cn); - TRON tron_obj(fun_obj, eps*min(pos,neg)/prob->l); - tron_obj.set_print_string(liblinear_print_string); - tron_obj.tron(w); - delete fun_obj; + l2r_lr_fun fun_obj(prob, param, C); + NEWTON newton_obj(&fun_obj, primal_solver_tol); + newton_obj.set_print_string(liblinear_print_string); + newton_obj.newton(w); break; } case L2R_L2LOSS_SVC: { - fun_obj=new l2r_l2_svc_fun(prob, Cp, Cn); - TRON tron_obj(fun_obj, eps*min(pos,neg)/prob->l); - tron_obj.set_print_string(liblinear_print_string); - tron_obj.tron(w); - delete fun_obj; + l2r_l2_svc_fun fun_obj(prob, param, C); + NEWTON newton_obj(&fun_obj, primal_solver_tol); + newton_obj.set_print_string(liblinear_print_string); + newton_obj.newton(w); break; } case L2R_L2LOSS_SVC_DUAL: - solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL); + { + iter = solve_l2r_l1l2_svc(prob, param, w, Cp, Cn, dual_solver_max_iter); + if(iter >= dual_solver_max_iter) + { + info("\nWARNING: reaching max number of iterations\nSwitching to use -s 2\n\n"); + // primal_solver_tol obtained from eps for dual may be too loose + primal_solver_tol *= 0.1; + l2r_l2_svc_fun fun_obj(prob, param, C); + NEWTON newton_obj(&fun_obj, primal_solver_tol); + newton_obj.set_print_string(liblinear_print_string); + newton_obj.newton(w); + } break; + } case L2R_L1LOSS_SVC_DUAL: - solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L1LOSS_SVC_DUAL); + { + iter = solve_l2r_l1l2_svc(prob, param, w, Cp, Cn, dual_solver_max_iter); + if(iter >= dual_solver_max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n"); break; + } case L1R_L2LOSS_SVC: { problem prob_col; feature_node *x_space = NULL; transpose(prob, 
&x_space ,&prob_col); - solve_l1r_l2_svc(&prob_col, w, eps*min(pos,neg)/prob->l, Cp, Cn); + solve_l1r_l2_svc(&prob_col, param, w, Cp, Cn, primal_solver_tol); delete [] prob_col.y; delete [] prob_col.x; delete [] x_space; @@ -1875,21 +2688,243 @@ static void train_one(const problem *prob, const parameter *param, double *w, do problem prob_col; feature_node *x_space = NULL; transpose(prob, &x_space ,&prob_col); - solve_l1r_lr(&prob_col, w, eps*min(pos,neg)/prob->l, Cp, Cn); + solve_l1r_lr(&prob_col, param, w, Cp, Cn, primal_solver_tol); delete [] prob_col.y; delete [] prob_col.x; delete [] x_space; break; } case L2R_LR_DUAL: - solve_l2r_lr_dual(prob, w, eps, Cp, Cn); + { + iter = solve_l2r_lr_dual(prob, param, w, Cp, Cn, dual_solver_max_iter); + if(iter >= dual_solver_max_iter) + { + info("\nWARNING: reaching max number of iterations\nSwitching to use -s 0\n\n"); + // primal_solver_tol obtained from eps for dual may be too loose + primal_solver_tol *= 0.1; + l2r_lr_fun fun_obj(prob, param, C); + NEWTON newton_obj(&fun_obj, primal_solver_tol); + newton_obj.set_print_string(liblinear_print_string); + newton_obj.newton(w); + } break; + } + case L2R_L2LOSS_SVR: + { + l2r_l2_svr_fun fun_obj(prob, param, C); + NEWTON newton_obj(&fun_obj, primal_solver_tol); + newton_obj.set_print_string(liblinear_print_string); + newton_obj.newton(w); + break; + } + case L2R_L1LOSS_SVR_DUAL: + { + iter = solve_l2r_l1l2_svr(prob, param, w, dual_solver_max_iter); + if(iter >= dual_solver_max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 11 may be faster (also see FAQ)\n\n"); + + break; + } + case L2R_L2LOSS_SVR_DUAL: + { + iter = solve_l2r_l1l2_svr(prob, param, w, dual_solver_max_iter); + if(iter >= dual_solver_max_iter) + { + info("\nWARNING: reaching max number of iterations\nSwitching to use -s 11\n\n"); + // primal_solver_tol obtained from eps for dual may be too loose + primal_solver_tol *= 0.001; + l2r_l2_svr_fun fun_obj(prob, param, C); + NEWTON newton_obj(&fun_obj, primal_solver_tol); + newton_obj.set_print_string(liblinear_print_string); + newton_obj.newton(w); + } + break; + } default: - fprintf(stderr, "Error: unknown solver_type\n"); + fprintf(stderr, "ERROR: unknown solver_type\n"); break; } + + delete[] C; } +// Calculate the initial C for parameter selection +static double calc_start_C(const problem *prob, const parameter *param) +{ + int i; + double xTx, max_xTx; + max_xTx = 0; + for(i=0; il; i++) + { + xTx = 0; + feature_node *xi=prob->x[i]; + while(xi->index != -1) + { + double val = xi->value; + xTx += val*val; + xi++; + } + if(xTx > max_xTx) + max_xTx = xTx; + } + + double min_C = 1.0; + if(param->solver_type == L2R_LR) + min_C = 1.0 / (prob->l * max_xTx); + else if(param->solver_type == L2R_L2LOSS_SVC) + min_C = 1.0 / (2 * prob->l * max_xTx); + else if(param->solver_type == L2R_L2LOSS_SVR) + { + double sum_y, loss, y_abs; + double delta2 = 0.1; + sum_y = 0, loss = 0; + for(i=0; il; i++) + { + y_abs = fabs(prob->y[i]); + sum_y += y_abs; + loss += max(y_abs - param->p, 0.0) * max(y_abs - param->p, 0.0); + } + if(loss > 0) + min_C = delta2 * delta2 * loss / (8 * sum_y * sum_y * max_xTx); + else + min_C = INF; + } + + return pow( 2, floor(log(min_C) / log(2.0)) ); +} + +static double calc_max_p(const problem *prob) +{ + int i; + double max_p = 0.0; + for(i = 0; i < prob->l; i++) + max_p = max(max_p, fabs(prob->y[i])); + + return max_p; +} + +static void find_parameter_C(const problem *prob, parameter *param_tmp, double start_C, double max_C, double *best_C, double *best_score, 
const int *fold_start, const int *perm, const problem *subprob, int nr_fold) +{ + // variables for CV + int i; + double *target = Malloc(double, prob->l); + + // variables for warm start + double ratio = 2; + double **prev_w = Malloc(double*, nr_fold); + for(i = 0; i < nr_fold; i++) + prev_w[i] = NULL; + int num_unchanged_w = 0; + void (*default_print_string) (const char *) = liblinear_print_string; + + if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC) + *best_score = 0.0; + else if(param_tmp->solver_type == L2R_L2LOSS_SVR) + *best_score = INF; + *best_C = start_C; + + param_tmp->C = start_C; + while(param_tmp->C <= max_C) + { + //Output disabled for running CV at a particular C + set_print_string_function(&print_null); + + for(i=0; iinit_sol = prev_w[i]; + struct model *submodel = train(&subprob[i],param_tmp); + + int total_w_size; + if(submodel->nr_class == 2) + total_w_size = subprob[i].n; + else + total_w_size = subprob[i].n * submodel->nr_class; + + if(prev_w[i] == NULL) + { + prev_w[i] = Malloc(double, total_w_size); + for(j=0; jw[j]; + } + else if(num_unchanged_w >= 0) + { + double norm_w_diff = 0; + for(j=0; jw[j] - prev_w[i][j])*(submodel->w[j] - prev_w[i][j]); + prev_w[i][j] = submodel->w[j]; + } + norm_w_diff = sqrt(norm_w_diff); + + if(norm_w_diff > 1e-15) + num_unchanged_w = -1; + } + else + { + for(j=0; jw[j]; + } + + for(j=begin; jx[perm[j]]); + + free_and_destroy_model(&submodel); + } + set_print_string_function(default_print_string); + + if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC) + { + int total_correct = 0; + for(i=0; il; i++) + if(target[i] == prob->y[i]) + ++total_correct; + double current_rate = (double)total_correct/prob->l; + if(current_rate > *best_score) + { + *best_C = param_tmp->C; + *best_score = current_rate; + } + + info("log2c=%7.2f\trate=%g\n",log(param_tmp->C)/log(2.0),100.0*current_rate); + } + else if(param_tmp->solver_type == L2R_L2LOSS_SVR) + { + double total_error = 0.0; + for(i=0; il; i++) + { + double y = prob->y[i]; + double v = target[i]; + total_error += (v-y)*(v-y); + } + double current_error = total_error/prob->l; + if(current_error < *best_score) + { + *best_C = param_tmp->C; + *best_score = current_error; + } + + info("log2c=%7.2f\tp=%7.2f\tMean squared error=%g\n",log(param_tmp->C)/log(2.0),param_tmp->p,current_error); + } + + num_unchanged_w++; + if(num_unchanged_w == 5) + break; + param_tmp->C = param_tmp->C*ratio; + } + + if(param_tmp->C > max_C) + info("WARNING: maximum C reached.\n"); + free(target); + for(i=0; iparam = *param; model_->bias = prob->bias; - int nr_class; - int *label = NULL; - int *start = NULL; - int *count = NULL; - int *perm = Malloc(int,l); - - // group training data of the same class - group_classes(prob,&nr_class,&label,&start,&count,perm); - - model_->nr_class=nr_class; - model_->label = Malloc(int,nr_class); - for(i=0;ilabel[i] = label[i]; - - // calculate weighted C - double *weighted_C = Malloc(double, nr_class); - for(i=0;iC; - for(i=0;inr_weight;i++) + if(check_regression_model(model_)) { - for(j=0;jweight_label[i] == label[j]) - break; - if(j == nr_class) - fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]); + model_->w = Malloc(double, w_size); + + if(param->init_sol != NULL) + for(i=0;iw[i] = param->init_sol[i]; else - weighted_C[j] *= param->weight[i]; + for(i=0;iw[i] = 0; + + model_->nr_class = 2; + model_->label = NULL; + train_one(prob, param, model_->w, 0, 0); } - - // constructing the 
subproblem - feature_node **x = Malloc(feature_node *,l); - for(i=0;ix[perm[i]]; - - int k; - problem sub_prob; - sub_prob.l = l; - sub_prob.n = n; - sub_prob.x = Malloc(feature_node *,sub_prob.l); - sub_prob.y = Malloc(int,sub_prob.l); - - for(k=0; ksolver_type == MCSVM_CS) + else if(check_oneclass_model(model_)) { - model_->w=Malloc(double, n*nr_class); - for(i=0;ieps); - Solver.Solve(model_->w); + model_->w = Malloc(double, w_size); + model_->nr_class = 2; + model_->label = NULL; + solve_oneclass_svm(prob, param, model_->w, &(model_->rho)); } else { - if(nr_class == 2) + int nr_class; + int *label = NULL; + int *start = NULL; + int *count = NULL; + int *perm = Malloc(int,l); + + // group training data of the same class + group_classes(prob,&nr_class,&label,&start,&count,perm); + + model_->nr_class=nr_class; + model_->label = Malloc(int,nr_class); + for(i=0;ilabel[i] = label[i]; + + // calculate weighted C + double *weighted_C = Malloc(double, nr_class); + for(i=0;iC; + for(i=0;inr_weight;i++) { - model_->w=Malloc(double, w_size); + for(j=0;jweight_label[i] == label[j]) + break; + if(j == nr_class) + fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]); + else + weighted_C[j] *= param->weight[i]; + } - int e0 = start[0]+count[0]; - k=0; - for(; kx[perm[i]]; - train_one(&sub_prob, param, &model_->w[0], weighted_C[0], weighted_C[1]); + int k; + problem sub_prob; + sub_prob.l = l; + sub_prob.n = n; + sub_prob.x = Malloc(feature_node *,sub_prob.l); + sub_prob.y = Malloc(double,sub_prob.l); + + for(k=0; ksolver_type == MCSVM_CS) + { + model_->w=Malloc(double, n*nr_class); + for(i=0;ieps); + Solver.Solve(model_->w); } else { - model_->w=Malloc(double, w_size*nr_class); - double *w=Malloc(double, w_size); - for(i=0;iw=Malloc(double, w_size); + int e0 = start[0]+count[0]; k=0; - for(; kC); + if(param->init_sol != NULL) + for(i=0;iw[i] = param->init_sol[i]; + else + for(i=0;iw[i] = 0; - for(int j=0;jw[j*nr_class+i] = w[j]; + train_one(&sub_prob, param, model_->w, weighted_C[0], weighted_C[1]); } - free(w); + else + { + model_->w=Malloc(double, w_size*nr_class); + double *w=Malloc(double, w_size); + for(i=0;iinit_sol != NULL) + for(j=0;jinit_sol[j*nr_class+i]; + else + for(j=0;jC); + + for(j=0;jw[j*nr_class+i] = w[j]; + } + free(w); + } + } + free(x); + free(label); + free(start); + free(count); + free(perm); + free(sub_prob.x); + free(sub_prob.y); + free(weighted_C); } - - free(x); - free(label); - free(start); - free(count); - free(perm); - free(sub_prob.x); - free(sub_prob.y); - free(weighted_C); return model_; } -void cross_validation(const problem *prob, const parameter *param, int nr_fold, int *target) +void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target) { int i; - int *fold_start = Malloc(int,nr_fold+1); + int *fold_start; int l = prob->l; int *perm = Malloc(int,l); - + if (nr_fold > l) + { + nr_fold = l; + fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n"); + } + fold_start = Malloc(int,nr_fold+1); for(i=0;in; subprob.l = l-(end-begin); subprob.x = Malloc(struct feature_node*,subprob.l); - subprob.y = Malloc(int,subprob.l); + subprob.y = Malloc(double,subprob.l); k=0; for(j=0;jl; + int *perm = Malloc(int, l); + struct problem *subprob = Malloc(problem,nr_fold); + + if (nr_fold > l) + { + nr_fold = l; + fprintf(stderr,"WARNING: # folds > # data. 
Will use # folds = # data instead (i.e., leave-one-out cross validation)\n"); + } + fold_start = Malloc(int,nr_fold+1); + for(i=0;ibias; + subprob[i].n = prob->n; + subprob[i].l = l-(end-begin); + subprob[i].x = Malloc(struct feature_node*,subprob[i].l); + subprob[i].y = Malloc(double,subprob[i].l); + + k=0; + for(j=0;jx[perm[j]]; + subprob[i].y[k] = prob->y[perm[j]]; + ++k; + } + for(j=end;jx[perm[j]]; + subprob[i].y[k] = prob->y[perm[j]]; + ++k; + } + } + + struct parameter param_tmp = *param; + *best_p = -1; + if(param->solver_type == L2R_LR || param->solver_type == L2R_L2LOSS_SVC) + { + if(start_C <= 0) + start_C = calc_start_C(prob, ¶m_tmp); + double max_C = 1024; + start_C = min(start_C, max_C); + double best_C_tmp, best_score_tmp; + + find_parameter_C(prob, ¶m_tmp, start_C, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold); + + *best_C = best_C_tmp; + *best_score = best_score_tmp; + } + else if(param->solver_type == L2R_L2LOSS_SVR) + { + double max_p = calc_max_p(prob); + int num_p_steps = 20; + double max_C = 1048576; + *best_score = INF; + + i = num_p_steps-1; + if(start_p > 0) + i = min((int)(start_p/(max_p/num_p_steps)), i); + for(; i >= 0; i--) + { + param_tmp.p = i*max_p/num_p_steps; + double start_C_tmp; + if(start_C <= 0) + start_C_tmp = calc_start_C(prob, ¶m_tmp); + else + start_C_tmp = start_C; + start_C_tmp = min(start_C_tmp, max_C); + double best_C_tmp, best_score_tmp; + + find_parameter_C(prob, ¶m_tmp, start_C_tmp, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold); + + if(best_score_tmp < *best_score) + { + *best_p = param_tmp.p; + *best_C = best_C_tmp; + *best_score = best_score_tmp; + } + } + } + + free(fold_start); + free(perm); + for(i=0; ivalue; } + if(check_oneclass_model(model_)) + dec_values[0] -= model_->rho; if(nr_class==2) - return (dec_values[0]>0)?model_->label[0]:model_->label[1]; + { + if(check_regression_model(model_)) + return dec_values[0]; + else if(check_oneclass_model(model_)) + return (dec_values[0]>0)?1:-1; + else + return (dec_values[0]>0)?model_->label[0]:model_->label[1]; + } else { int dec_max_idx = 0; @@ -2110,15 +3308,15 @@ int predict_values(const struct model *model_, const struct feature_node *x, dou } } -int predict(const model *model_, const feature_node *x) +double predict(const model *model_, const feature_node *x) { double *dec_values = Malloc(double, model_->nr_class); - int label=predict_values(model_, x, dec_values); + double label=predict_values(model_, x, dec_values); free(dec_values); return label; } -int predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates) +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates) { if(check_probability_model(model_)) { @@ -2130,7 +3328,7 @@ int predict_probability(const struct model *model_, const struct feature_node *x else nr_w = nr_class; - int label=predict_values(model_, x, prob_estimates); + double label=predict_values(model_, x, prob_estimates); for(i=0;inr_class==2 && model_->param.solver_type != MCSVM_CS) nr_w=1; @@ -2181,28 +3390,62 @@ int save_model(const char *model_file_name, const struct model *model_) fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]); fprintf(fp, "nr_class %d\n", model_->nr_class); - fprintf(fp, "label"); - for(i=0; inr_class; i++) - fprintf(fp, " %d", model_->label[i]); - fprintf(fp, "\n"); + + if(model_->label) + { + fprintf(fp, "label"); + for(i=0; inr_class; i++) + fprintf(fp, " %d", 
model_->label[i]); + fprintf(fp, "\n"); + } fprintf(fp, "nr_feature %d\n", nr_feature); - fprintf(fp, "bias %.16g\n", model_->bias); + fprintf(fp, "bias %.17g\n", model_->bias); + + if(check_oneclass_model(model_)) + fprintf(fp, "rho %.17g\n", model_->rho); fprintf(fp, "w\n"); for(i=0; iw[i*nr_w+j]); + fprintf(fp, "%.17g ", model_->w[i*nr_w+j]); fprintf(fp, "\n"); } + setlocale(LC_ALL, old_locale); + free(old_locale); + if (ferror(fp) != 0 || fclose(fp) != 0) return -1; else return 0; } +// +// FSCANF helps to handle fscanf failures. +// Its do-while block avoids the ambiguity when +// if (...) +// FSCANF(); +// is used +// +#define FSCANF(_stream, _format, _var)do\ +{\ + if (fscanf(_stream, _format, _var) != 1)\ + {\ + fprintf(stderr, "ERROR: fscanf failed to read the model\n");\ + EXIT_LOAD_MODEL()\ + }\ +}while(0) +// EXIT_LOAD_MODEL should NOT end with a semicolon. +#define EXIT_LOAD_MODEL()\ +{\ + setlocale(LC_ALL, old_locale);\ + free(model_->label);\ + free(model_);\ + free(old_locale);\ + return NULL;\ +} struct model *load_model(const char *model_file_name) { FILE *fp = fopen(model_file_name,"r"); @@ -2213,18 +3456,31 @@ struct model *load_model(const char *model_file_name) int n; int nr_class; double bias; + double rho; model *model_ = Malloc(model,1); parameter& param = model_->param; + // parameters for training only won't be assigned, but arrays are assigned as NULL for safety + param.nr_weight = 0; + param.weight_label = NULL; + param.weight = NULL; + param.init_sol = NULL; model_->label = NULL; + char *old_locale = setlocale(LC_ALL, NULL); + if (old_locale) + { + old_locale = strdup(old_locale); + } + setlocale(LC_ALL, "C"); + char cmd[81]; while(1) { - fscanf(fp,"%80s",cmd); + FSCANF(fp,"%80s",cmd); if(strcmp(cmd,"solver_type")==0) { - fscanf(fp,"%80s",cmd); + FSCANF(fp,"%80s",cmd); int i; for(i=0;solver_type_table[i];i++) { @@ -2237,26 +3493,29 @@ struct model *load_model(const char *model_file_name) if(solver_type_table[i] == NULL) { fprintf(stderr,"unknown solver type.\n"); - free(model_->label); - free(model_); - return NULL; + EXIT_LOAD_MODEL() } } else if(strcmp(cmd,"nr_class")==0) { - fscanf(fp,"%d",&nr_class); + FSCANF(fp,"%d",&nr_class); model_->nr_class=nr_class; } else if(strcmp(cmd,"nr_feature")==0) { - fscanf(fp,"%d",&nr_feature); + FSCANF(fp,"%d",&nr_feature); model_->nr_feature=nr_feature; } else if(strcmp(cmd,"bias")==0) { - fscanf(fp,"%lf",&bias); + FSCANF(fp,"%lf",&bias); model_->bias=bias; } + else if(strcmp(cmd,"rho")==0) + { + FSCANF(fp,"%lf",&rho); + model_->rho=rho; + } else if(strcmp(cmd,"w")==0) { break; @@ -2266,13 +3525,12 @@ struct model *load_model(const char *model_file_name) int nr_class = model_->nr_class; model_->label = Malloc(int,nr_class); for(int i=0;ilabel[i]); + FSCANF(fp,"%d",&model_->label[i]); } else { fprintf(stderr,"unknown text in model file: [%s]\n",cmd); - free(model_); - return NULL; + EXIT_LOAD_MODEL() } } @@ -2293,9 +3551,12 @@ struct model *load_model(const char *model_file_name) { int j; for(j=0; jw[i*nr_w+j]); - fscanf(fp, "\n"); + FSCANF(fp, "%lf ", &model_->w[i*nr_w+j]); } + + setlocale(LC_ALL, old_locale); + free(old_locale); + if (ferror(fp) != 0 || fclose(fp) != 0) return NULL; return model_; @@ -2318,12 +3579,76 @@ void get_labels(const model *model_, int* label) label[i] = model_->label[i]; } +// use inline here for better performance (around 20% faster than the non-inline one) +static inline double get_w_value(const struct model *model_, int idx, int label_idx) +{ + int nr_class = model_->nr_class; + int 
solver_type = model_->param.solver_type; + const double *w = model_->w; + + if(idx < 0 || idx > model_->nr_feature) + return 0; + if(check_regression_model(model_) || check_oneclass_model(model_)) + return w[idx]; + else + { + if(label_idx < 0 || label_idx >= nr_class) + return 0; + if(nr_class == 2 && solver_type != MCSVM_CS) + { + if(label_idx == 0) + return w[idx]; + else + return -w[idx]; + } + else + return w[idx*nr_class+label_idx]; + } +} + +// feat_idx: starting from 1 to nr_feature +// label_idx: starting from 0 to nr_class-1 for classification models; +// for regression and one-class SVM models, label_idx is +// ignored. +double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx) +{ + if(feat_idx > model_->nr_feature) + return 0; + return get_w_value(model_, feat_idx-1, label_idx); +} + +double get_decfun_bias(const struct model *model_, int label_idx) +{ + if(check_oneclass_model(model_)) + { + fprintf(stderr, "ERROR: get_decfun_bias can not be called for a one-class SVM model\n"); + return 0; + } + int bias_idx = model_->nr_feature; + double bias = model_->bias; + if(bias <= 0) + return 0; + else + return bias*get_w_value(model_, bias_idx, label_idx); +} + +double get_decfun_rho(const struct model *model_) +{ + if(check_oneclass_model(model_)) + return model_->rho; + else + { + fprintf(stderr, "ERROR: get_decfun_rho can be called only for a one-class SVM model\n"); + return 0; + } +} + void free_model_content(struct model *model_ptr) { - if(model_ptr->w != NULL) - free(model_ptr->w); - if(model_ptr->label != NULL) - free(model_ptr->label); + free(model_ptr->w); + model_ptr->w = NULL; + free(model_ptr->label); + model_ptr->label = NULL; } void free_and_destroy_model(struct model **model_ptr_ptr) @@ -2333,15 +3658,18 @@ void free_and_destroy_model(struct model **model_ptr_ptr) { free_model_content(model_ptr); free(model_ptr); + *model_ptr_ptr = NULL; } } void destroy_param(parameter* param) { - if(param->weight_label != NULL) - free(param->weight_label); - if(param->weight != NULL) - free(param->weight); + free(param->weight_label); + param->weight_label = NULL; + free(param->weight); + param->weight = NULL; + free(param->init_sol); + param->init_sol = NULL; } const char *check_parameter(const problem *prob, const parameter *param) @@ -2352,6 +3680,24 @@ const char *check_parameter(const problem *prob, const parameter *param) if(param->C <= 0) return "C <= 0"; + if(param->p < 0 && param->solver_type == L2R_L2LOSS_SVR) + return "p < 0"; + + if(prob->bias >= 0 && param->solver_type == ONECLASS_SVM) + return "prob->bias >=0, but this is ignored in ONECLASS_SVM"; + + if(param->regularize_bias == 0) + { + if(prob->bias != 1.0) + return "To not regularize bias, must specify -B 1 along with -R"; + if(param->solver_type != L2R_LR + && param->solver_type != L2R_L2LOSS_SVC + && param->solver_type != L1R_L2LOSS_SVC + && param->solver_type != L1R_LR + && param->solver_type != L2R_L2LOSS_SVR) + return "-R option supported only for solver L2R_LR, L2R_L2LOSS_SVC, L1R_L2LOSS_SVC, L1R_LR, and L2R_L2LOSS_SVR"; + } + if(param->solver_type != L2R_LR && param->solver_type != L2R_L2LOSS_SVC_DUAL && param->solver_type != L2R_L2LOSS_SVC @@ -2359,9 +3705,19 @@ const char *check_parameter(const problem *prob, const parameter *param) && param->solver_type != MCSVM_CS && param->solver_type != L1R_L2LOSS_SVC && param->solver_type != L1R_LR - && param->solver_type != L2R_LR_DUAL) + && param->solver_type != L2R_LR_DUAL + && param->solver_type != L2R_L2LOSS_SVR + && param->solver_type != 
L2R_L2LOSS_SVR_DUAL + && param->solver_type != L2R_L1LOSS_SVR_DUAL + && param->solver_type != ONECLASS_SVM) return "unknown solver type"; + if(param->init_sol != NULL + && param->solver_type != L2R_LR + && param->solver_type != L2R_L2LOSS_SVC + && param->solver_type != L2R_L2LOSS_SVR) + return "Initial-solution specification supported only for solvers L2R_LR, L2R_L2LOSS_SVC, and L2R_L2LOSS_SVR"; + return NULL; } @@ -2372,9 +3728,21 @@ int check_probability_model(const struct model *model_) model_->param.solver_type==L1R_LR); } +int check_regression_model(const struct model *model_) +{ + return (model_->param.solver_type==L2R_L2LOSS_SVR || + model_->param.solver_type==L2R_L1LOSS_SVR_DUAL || + model_->param.solver_type==L2R_L2LOSS_SVR_DUAL); +} + +int check_oneclass_model(const struct model *model_) +{ + return model_->param.solver_type == ONECLASS_SVM; +} + void set_print_string_function(void (*print_func)(const char*)) { - if (print_func == NULL) + if (print_func == NULL) liblinear_print_string = &print_string_stdout; else liblinear_print_string = print_func; diff --git a/liblinear/linear.def b/liblinear/linear.def index 2425996ce..c3b8d5174 100644 --- a/liblinear/linear.def +++ b/liblinear/linear.def @@ -16,3 +16,9 @@ EXPORTS check_parameter @14 check_probability_model @15 set_print_string_function @16 + get_decfun_coef @17 + get_decfun_bias @18 + check_regression_model @19 + find_parameters @20 + get_decfun_rho @21 + check_oneclass_model @22 diff --git a/liblinear/linear.h b/liblinear/linear.h index 2a1aa28d2..966ac4c55 100644 --- a/liblinear/linear.h +++ b/liblinear/linear.h @@ -1,10 +1,14 @@ #ifndef _LIBLINEAR_H #define _LIBLINEAR_H +#define LIBLINEAR_VERSION 247 + #ifdef __cplusplus extern "C" { #endif +extern int liblinear_version; + struct feature_node { int index; @@ -14,41 +18,47 @@ struct feature_node struct problem { int l, n; - int *y; + double *y; struct feature_node **x; - double bias; /* < 0 if no bias term */ + double bias; /* < 0 if no bias term */ }; -enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL }; /* solver_type */ +enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR = 11, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL, ONECLASS_SVM = 21 }; /* solver_type */ struct parameter { int solver_type; /* these are for training only */ - double eps; /* stopping criteria */ + double eps; /* stopping tolerance */ double C; int nr_weight; int *weight_label; double* weight; + double p; + double nu; + double *init_sol; + int regularize_bias; }; struct model { struct parameter param; - int nr_class; /* number of classes */ + int nr_class; /* number of classes */ int nr_feature; double *w; - int *label; /* label of each class */ + int *label; /* label of each class */ double bias; + double rho; /* one-class SVM only */ }; struct model* train(const struct problem *prob, const struct parameter *param); -void cross_validation(const struct problem *prob, const struct parameter *param, int nr_fold, int *target); +void cross_validation(const struct problem *prob, const struct parameter *param, int nr_fold, double *target); +void find_parameters(const struct problem *prob, const struct parameter *param, int nr_fold, double start_C, double start_p, double *best_C, double *best_p, double *best_score); -int predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); -int predict(const struct model *model_, const 
struct feature_node *x); -int predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); +double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); +double predict(const struct model *model_, const struct feature_node *x); +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); int save_model(const char *model_file_name, const struct model *model_); struct model *load_model(const char *model_file_name); @@ -56,6 +66,9 @@ struct model *load_model(const char *model_file_name); int get_nr_feature(const struct model *model_); int get_nr_class(const struct model *model_); void get_labels(const struct model *model_, int* label); +double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx); +double get_decfun_bias(const struct model *model_, int label_idx); +double get_decfun_rho(const struct model *model_); void free_model_content(struct model *model_ptr); void free_and_destroy_model(struct model **model_ptr_ptr); @@ -63,6 +76,8 @@ void destroy_param(struct parameter *param); const char *check_parameter(const struct problem *prob, const struct parameter *param); int check_probability_model(const struct model *model); +int check_regression_model(const struct model *model); +int check_oneclass_model(const struct model *model); void set_print_string_function(void (*print_func) (const char*)); #ifdef __cplusplus diff --git a/liblinear/newton.cpp b/liblinear/newton.cpp new file mode 100644 index 000000000..0fe3ccffc --- /dev/null +++ b/liblinear/newton.cpp @@ -0,0 +1,251 @@ +#include +#include +#include +#include +#include "newton.h" + +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +extern double dnrm2_(int *, double *, int *); +extern double ddot_(int *, double *, int *, double *, int *); +extern int daxpy_(int *, double *, double *, int *, double *, int *); +extern int dscal_(int *, double *, double *, int *); + +#ifdef __cplusplus +} +#endif + +static void default_print(const char *buf) +{ + fputs(buf,stdout); + fflush(stdout); +} + +// On entry *f must be the function value of w +// On exit w is updated and *f is the new function value +double function::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha) +{ + double gTs = 0; + double eta = 0.01; + int n = get_nr_variable(); + int max_num_linesearch = 20; + double *w_new = new double[n]; + double fold = *f; + + for (int i=0;i= max_num_linesearch) + { + *f = fold; + return 0; + } + else + memcpy(w, w_new, sizeof(double)*n); + + delete [] w_new; + return alpha; +} + +void NEWTON::info(const char *fmt,...) 
+{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*newton_print_string)(buf); +} + +NEWTON::NEWTON(const function *fun_obj, double eps, double eps_cg, int max_iter) +{ + this->fun_obj=const_cast(fun_obj); + this->eps=eps; + this->eps_cg=eps_cg; + this->max_iter=max_iter; + newton_print_string = default_print; +} + +NEWTON::~NEWTON() +{ +} + +void NEWTON::newton(double *w) +{ + int n = fun_obj->get_nr_variable(); + int i, cg_iter; + double step_size; + double f, fold, actred; + double init_step_size = 1; + int search = 1, iter = 1, inc = 1; + double *s = new double[n]; + double *r = new double[n]; + double *g = new double[n]; + + const double alpha_pcg = 0.01; + double *M = new double[n]; + + // calculate gradient norm at w=0 for stopping condition. + double *w0 = new double[n]; + for (i=0; ifun(w0); + fun_obj->grad(w0, g); + double gnorm0 = dnrm2_(&n, g, &inc); + delete [] w0; + + f = fun_obj->fun(w); + fun_obj->grad(w, g); + double gnorm = dnrm2_(&n, g, &inc); + info("init f %5.3e |g| %5.3e\n", f, gnorm); + + if (gnorm <= eps*gnorm0) + search = 0; + + while (iter <= max_iter && search) + { + fun_obj->get_diag_preconditioner(M); + for(i=0; ilinesearch_and_update(w, s, &f, g, init_step_size); + + if (step_size == 0) + { + info("WARNING: line search fails\n"); + break; + } + + fun_obj->grad(w, g); + gnorm = dnrm2_(&n, g, &inc); + + info("iter %2d f %5.3e |g| %5.3e CG %3d step_size %4.2e \n", iter, f, gnorm, cg_iter, step_size); + + if (gnorm <= eps*gnorm0) + break; + if (f < -1.0e+32) + { + info("WARNING: f < -1.0e+32\n"); + break; + } + actred = fold - f; + if (fabs(actred) <= 1.0e-12*fabs(f)) + { + info("WARNING: actred too small\n"); + break; + } + + iter++; + } + + if(iter >= max_iter) + info("\nWARNING: reaching max number of Newton iterations\n"); + + delete[] g; + delete[] r; + delete[] s; + delete[] M; +} + +int NEWTON::pcg(double *g, double *M, double *s, double *r) +{ + int i, inc = 1; + int n = fun_obj->get_nr_variable(); + double one = 1; + double *d = new double[n]; + double *Hd = new double[n]; + double zTr, znewTrnew, alpha, beta, cgtol, dHd; + double *z = new double[n]; + double Q = 0, newQ, Qdiff; + + for (i=0; iHv(d, Hd); + dHd = ddot_(&n, d, &inc, Hd, &inc); + // avoid 0/0 in getting alpha + if (dHd <= 1.0e-16) + break; + + alpha = zTr/dHd; + daxpy_(&n, &alpha, d, &inc, s, &inc); + alpha = -alpha; + daxpy_(&n, &alpha, Hd, &inc, r, &inc); + + // Using quadratic approximation as CG stopping criterion + newQ = -0.5*(ddot_(&n, s, &inc, r, &inc) - ddot_(&n, s, &inc, g, &inc)); + Qdiff = newQ - Q; + if (newQ <= 0 && Qdiff <= 0) + { + if (cg_iter * Qdiff >= cgtol * newQ) + break; + } + else + { + info("WARNING: quadratic approximation > 0 or increasing in CG\n"); + break; + } + Q = newQ; + + for (i=0; i #include "linear.h" +int print_null(const char *s,...) {return 0;} + +static int (*info)(const char *fmt,...) 
= &printf; + struct feature_node *x; int max_nr_attr = 64; @@ -23,7 +27,7 @@ static int max_line_len; static char* readline(FILE *input) { int len; - + if(fgets(line,max_line_len,input) == NULL) return NULL; @@ -38,10 +42,12 @@ static char* readline(FILE *input) return line; } -void do_predict(FILE *input, FILE *output, struct model* model_) +void do_predict(FILE *input, FILE *output) { int correct = 0; int total = 0; + double error = 0; + double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; int nr_class=get_nr_class(model_); double *prob_estimates=NULL; @@ -65,7 +71,7 @@ void do_predict(FILE *input, FILE *output, struct model* model_) labels=(int *) malloc(nr_class*sizeof(int)); get_labels(model_,labels); prob_estimates = (double *) malloc(nr_class*sizeof(double)); - fprintf(output,"labels"); + fprintf(output,"labels"); for(j=0;j=max_nr_attr-2) // need one more for index = -1 + if(i>=max_nr_attr-2) // need one more for index = -1 { max_nr_attr *= 2; x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); @@ -131,7 +137,7 @@ void do_predict(FILE *input, FILE *output, struct model* model_) { int j; predict_label = predict_probability(model_,x,prob_estimates); - fprintf(output,"%d",predict_label); + fprintf(output,"%g",predict_label); for(j=0;jnr_class;j++) fprintf(output," %g",prob_estimates[j]); fprintf(output,"\n"); @@ -139,14 +145,29 @@ void do_predict(FILE *input, FILE *output, struct model* model_) else { predict_label = predict(model_,x); - fprintf(output,"%d\n",predict_label); + fprintf(output,"%.17g\n",predict_label); } if(predict_label == target_label) ++correct; + error += (predict_label-target_label)*(predict_label-target_label); + sump += predict_label; + sumt += target_label; + sumpp += predict_label*predict_label; + sumtt += target_label*target_label; + sumpt += predict_label*target_label; ++total; } - printf("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total); + if(check_regression_model(model_)) + { + info("Mean squared error = %g (regression)\n",error/total); + info("Squared correlation coefficient = %g (regression)\n", + ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/ + ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt)) + ); + } + else + info("Accuracy = %g%% (%d/%d)\n",(double) correct/total*100,correct,total); if(flag_predict_probability) free(prob_estimates); } @@ -156,7 +177,8 @@ void exit_with_help() printf( "Usage: predict [options] test_file model_file output_file\n" "options:\n" - "-b probability_estimates: whether to output probability estimates, 0 or 1 (default 0)\n" + "-b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only\n" + "-q : quiet mode (no outputs)\n" ); exit(1); } @@ -176,7 +198,10 @@ int main(int argc, char **argv) case 'b': flag_predict_probability = atoi(argv[i]); break; - + case 'q': + info = &print_null; + i--; + break; default: fprintf(stderr,"unknown option: -%c\n", argv[i-1][1]); exit_with_help(); @@ -207,7 +232,7 @@ int main(int argc, char **argv) } x = (struct feature_node *) malloc(max_nr_attr*sizeof(struct feature_node)); - do_predict(input, output, model_); + do_predict(input, output); free_and_destroy_model(&model_); free(line); free(x); diff --git a/liblinear/train.c b/liblinear/train.c index 4920313a0..7923a8d31 100644 --- a/liblinear/train.c +++ b/liblinear/train.c @@ -16,28 +16,45 @@ void exit_with_help() "Usage: train [options] training_set_file [model_file]\n" "options:\n" "-s type : set type of solver (default 
1)\n" - " 0 -- L2-regularized logistic regression (primal)\n" - " 1 -- L2-regularized L2-loss support vector classification (dual)\n" - " 2 -- L2-regularized L2-loss support vector classification (primal)\n" - " 3 -- L2-regularized L1-loss support vector classification (dual)\n" - " 4 -- multi-class support vector classification by Crammer and Singer\n" - " 5 -- L1-regularized L2-loss support vector classification\n" - " 6 -- L1-regularized logistic regression\n" - " 7 -- L2-regularized logistic regression (dual)\n" + " for multi-class classification\n" + " 0 -- L2-regularized logistic regression (primal)\n" + " 1 -- L2-regularized L2-loss support vector classification (dual)\n" + " 2 -- L2-regularized L2-loss support vector classification (primal)\n" + " 3 -- L2-regularized L1-loss support vector classification (dual)\n" + " 4 -- support vector classification by Crammer and Singer\n" + " 5 -- L1-regularized L2-loss support vector classification\n" + " 6 -- L1-regularized logistic regression\n" + " 7 -- L2-regularized logistic regression (dual)\n" + " for regression\n" + " 11 -- L2-regularized L2-loss support vector regression (primal)\n" + " 12 -- L2-regularized L2-loss support vector regression (dual)\n" + " 13 -- L2-regularized L1-loss support vector regression (dual)\n" + " for outlier detection\n" + " 21 -- one-class support vector machine (dual)\n" "-c cost : set the parameter C (default 1)\n" + "-p epsilon : set the epsilon in loss function of SVR (default 0.1)\n" + "-n nu : set the parameter nu of one-class SVM (default 0.5)\n" "-e epsilon : set tolerance of termination criterion\n" - " -s 0 and 2\n" - " |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n" - " where f is the primal function and pos/neg are # of\n" - " positive/negative data (default 0.01)\n" - " -s 1, 3, 4 and 7\n" - " Dual maximal violation <= eps; similar to libsvm (default 0.1)\n" - " -s 5 and 6\n" - " |f'(w)|_1 <= eps*min(pos,neg)/l*|f'(w0)|_1,\n" - " where f is the primal function (default 0.01)\n" + " -s 0 and 2\n" + " |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n" + " where f is the primal function and pos/neg are # of\n" + " positive/negative data (default 0.01)\n" + " -s 11\n" + " |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.0001)\n" + " -s 1, 3, 4, 7, and 21\n" + " Dual maximal violation <= eps; similar to libsvm (default 0.1 except 0.01 for -s 21)\n" + " -s 5 and 6\n" + " |f'(w)|_1 <= eps*min(pos,neg)/l*|f'(w0)|_1,\n" + " where f is the primal function (default 0.01)\n" + " -s 12 and 13\n" + " |f'(alpha)|_1 <= eps |f'(alpha0)|,\n" + " where f is the dual function (default 0.1)\n" "-B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)\n" + "-R : not regularize the bias; must with -B 1 to have the bias; DON'T use this unless you know what it is\n" + " (for -s 0, 2, 5, 6, 11)\n" "-wi weight: weights adjust the parameter C of different classes (see README for details)\n" "-v n: n-fold cross validation mode\n" + "-C : find parameters (C for -s 0, 2 and C, p for -s 11)\n" "-q : quiet mode (no outputs)\n" ); exit(1); @@ -55,7 +72,7 @@ static int max_line_len; static char* readline(FILE *input) { int len; - + if(fgets(line,max_line_len,input) == NULL) return NULL; @@ -73,12 +90,17 @@ static char* readline(FILE *input) void parse_command_line(int argc, char **argv, char *input_file_name, char *model_file_name); void read_problem(const char *filename); void do_cross_validation(); +void do_find_parameters(); struct feature_node *x_space; struct parameter param; struct problem prob; 
struct model* model_; int flag_cross_validation; +int flag_find_parameters; +int flag_C_specified; +int flag_p_specified; +int flag_solver_specified; int nr_fold; double bias; @@ -94,11 +116,15 @@ int main(int argc, char **argv) if(error_msg) { - fprintf(stderr,"Error: %s\n",error_msg); + fprintf(stderr,"ERROR: %s\n",error_msg); exit(1); } - if(flag_cross_validation) + if (flag_find_parameters) + { + do_find_parameters(); + } + else if(flag_cross_validation) { do_cross_validation(); } @@ -121,18 +147,63 @@ int main(int argc, char **argv) return 0; } +void do_find_parameters() +{ + double start_C, start_p, best_C, best_p, best_score; + if (flag_C_specified) + start_C = param.C; + else + start_C = -1.0; + if (flag_p_specified) + start_p = param.p; + else + start_p = -1.0; + + printf("Doing parameter search with %d-fold cross validation.\n", nr_fold); + find_parameters(&prob, ¶m, nr_fold, start_C, start_p, &best_C, &best_p, &best_score); + if(param.solver_type == L2R_LR || param.solver_type == L2R_L2LOSS_SVC) + printf("Best C = %g CV accuracy = %g%%\n", best_C, 100.0*best_score); + else if(param.solver_type == L2R_L2LOSS_SVR) + printf("Best C = %g Best p = %g CV MSE = %g\n", best_C, best_p, best_score); +} + void do_cross_validation() { int i; int total_correct = 0; - int *target = Malloc(int, prob.l); + double total_error = 0; + double sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0; + double *target = Malloc(double, prob.l); cross_validation(&prob,¶m,nr_fold,target); - - for(i=0;iindex = prob.n; + (prob.x[i]-2)->index = prob.n; x_space[j-2].index = prob.n; } else diff --git a/liblinear/tron.cpp b/liblinear/tron.cpp deleted file mode 100644 index bff50787e..000000000 --- a/liblinear/tron.cpp +++ /dev/null @@ -1,235 +0,0 @@ -#include -#include -#include -#include -#include "tron.h" - -#ifndef min -template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -extern double dnrm2_(int *, double *, int *); -extern double ddot_(int *, double *, int *, double *, int *); -extern int daxpy_(int *, double *, double *, int *, double *, int *); -extern int dscal_(int *, double *, double *, int *); - -#ifdef __cplusplus -} -#endif - -static void default_print(const char *buf) -{ - fputs(buf,stdout); - fflush(stdout); -} - -void TRON::info(const char *fmt,...) -{ - char buf[BUFSIZ]; - va_list ap; - va_start(ap,fmt); - vsprintf(buf,fmt,ap); - va_end(ap); - (*tron_print_string)(buf); -} - -TRON::TRON(const function *fun_obj, double eps, int max_iter) -{ - this->fun_obj=const_cast(fun_obj); - this->eps=eps; - this->max_iter=max_iter; - tron_print_string = default_print; -} - -TRON::~TRON() -{ -} - -void TRON::tron(double *w) -{ - // Parameters for updating the iterates. - double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75; - - // Parameters for updating the trust region size delta. 
- double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4; - - int n = fun_obj->get_nr_variable(); - int i, cg_iter; - double delta, snorm, one=1.0; - double alpha, f, fnew, prered, actred, gs; - int search = 1, iter = 1, inc = 1; - double *s = new double[n]; - double *r = new double[n]; - double *w_new = new double[n]; - double *g = new double[n]; - - for (i=0; ifun(w); - fun_obj->grad(w, g); - delta = dnrm2_(&n, g, &inc); - double gnorm1 = delta; - double gnorm = gnorm1; - - if (gnorm <= eps*gnorm1) - search = 0; - - iter = 1; - - while (iter <= max_iter && search) - { - cg_iter = trcg(delta, g, s, r); - - memcpy(w_new, w, sizeof(double)*n); - daxpy_(&n, &one, s, &inc, w_new, &inc); - - gs = ddot_(&n, g, &inc, s, &inc); - prered = -0.5*(gs-ddot_(&n, s, &inc, r, &inc)); - fnew = fun_obj->fun(w_new); - - // Compute the actual reduction. - actred = f - fnew; - - // On the first iteration, adjust the initial step bound. - snorm = dnrm2_(&n, s, &inc); - if (iter == 1) - delta = min(delta, snorm); - - // Compute prediction alpha*snorm of the step. - if (fnew - f - gs <= 0) - alpha = sigma3; - else - alpha = max(sigma1, -0.5*(gs/(fnew - f - gs))); - - // Update the trust region bound according to the ratio of actual to predicted reduction. - if (actred < eta0*prered) - delta = min(max(alpha, sigma1)*snorm, sigma2*delta); - else if (actred < eta1*prered) - delta = max(sigma1*delta, min(alpha*snorm, sigma2*delta)); - else if (actred < eta2*prered) - delta = max(sigma1*delta, min(alpha*snorm, sigma3*delta)); - else - delta = max(delta, min(alpha*snorm, sigma3*delta)); - - info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter); - - if (actred > eta0*prered) - { - iter++; - memcpy(w, w_new, sizeof(double)*n); - f = fnew; - fun_obj->grad(w, g); - - gnorm = dnrm2_(&n, g, &inc); - if (gnorm <= eps*gnorm1) - break; - } - if (f < -1.0e+32) - { - info("warning: f < -1.0e+32\n"); - break; - } - if (fabs(actred) <= 0 && prered <= 0) - { - info("warning: actred and prered <= 0\n"); - break; - } - if (fabs(actred) <= 1.0e-12*fabs(f) && - fabs(prered) <= 1.0e-12*fabs(f)) - { - info("warning: actred and prered too small\n"); - break; - } - } - - delete[] g; - delete[] r; - delete[] w_new; - delete[] s; -} - -int TRON::trcg(double delta, double *g, double *s, double *r) -{ - int i, inc = 1; - int n = fun_obj->get_nr_variable(); - double one = 1; - double *d = new double[n]; - double *Hd = new double[n]; - double rTr, rnewTrnew, alpha, beta, cgtol; - - for (i=0; iHv(d, Hd); - - alpha = rTr/ddot_(&n, d, &inc, Hd, &inc); - daxpy_(&n, &alpha, d, &inc, s, &inc); - if (dnrm2_(&n, s, &inc) > delta) - { - info("cg reaches trust region boundary\n"); - alpha = -alpha; - daxpy_(&n, &alpha, d, &inc, s, &inc); - - double std = ddot_(&n, s, &inc, d, &inc); - double sts = ddot_(&n, s, &inc, s, &inc); - double dtd = ddot_(&n, d, &inc, d, &inc); - double dsq = delta*delta; - double rad = sqrt(std*std + dtd*(dsq-sts)); - if (std >= 0) - alpha = (dsq - sts)/(std + rad); - else - alpha = (rad - std)/dtd; - daxpy_(&n, &alpha, d, &inc, s, &inc); - alpha = -alpha; - daxpy_(&n, &alpha, Hd, &inc, r, &inc); - break; - } - alpha = -alpha; - daxpy_(&n, &alpha, Hd, &inc, r, &inc); - rnewTrnew = ddot_(&n, r, &inc, r, &inc); - beta = rnewTrnew/rTr; - dscal_(&n, &beta, d, &inc); - daxpy_(&n, &one, r, &inc, d, &inc); - rTr = rnewTrnew; - } - - delete[] d; - delete[] Hd; - - return(cg_iter); -} - -double TRON::norm_inf(int n, double *x) -{ - double dmax = fabs(x[0]); - for (int i=1; i= 
dmax) - dmax = fabs(x[i]); - return(dmax); -} - -void TRON::set_print_string(void (*print_string) (const char *buf)) -{ - tron_print_string = print_string; -} diff --git a/liblinear/tron.h b/liblinear/tron.h deleted file mode 100644 index 3045c2e83..000000000 --- a/liblinear/tron.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _TRON_H -#define _TRON_H - -class function -{ -public: - virtual double fun(double *w) = 0 ; - virtual void grad(double *w, double *g) = 0 ; - virtual void Hv(double *s, double *Hs) = 0 ; - - virtual int get_nr_variable(void) = 0 ; - virtual ~function(void){} -}; - -class TRON -{ -public: - TRON(const function *fun_obj, double eps = 0.1, int max_iter = 1000); - ~TRON(); - - void tron(double *w); - void set_print_string(void (*i_print) (const char *buf)); - -private: - int trcg(double delta, double *g, double *s, double *r); - double norm_inf(int n, double *x); - - double eps; - int max_iter; - function *fun_obj; - void info(const char *fmt,...); - void (*tron_print_string)(const char *buf); -}; -#endif
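With tron.cpp/tron.h removed in favor of NEWTON, the other user-visible additions in this upgrade are the parameter-search entry point find_parameters() and the decision-function accessors get_decfun_coef()/get_decfun_bias()/get_decfun_rho() declared in linear.h above. A minimal sketch of how downstream code might call the first two of these; the helper name, fold count, solver choice, and the assumption that prob was already loaded (for example by the same read_problem() logic train.c uses) are illustrative, not taken from the sources:

    #include <stdio.h>
    #include "linear.h"

    /* hypothetical helper: warm-started C search, then inspect the trained weights */
    void tune_and_inspect(const struct problem *prob)
    {
        struct parameter param = {0};
        param.solver_type = L2R_LR;   /* -s 0: only C is searched; -s 11 would also search p */
        param.eps = 0.01;
        param.C = 1;
        param.regularize_bias = 1;

        double best_C, best_p, best_score;
        /* start_C/start_p <= 0 let liblinear pick its own starting points,
           mirroring what train.c -C does when -c/-p are not given */
        find_parameters(prob, &param, 5, -1.0, -1.0, &best_C, &best_p, &best_score);
        printf("best C = %g, CV accuracy = %g%%\n", best_C, 100.0 * best_score);

        param.C = best_C;
        struct model *m = train(prob, &param);

        /* feat_idx is 1-based, label_idx 0-based; label_idx is ignored
           for regression and one-class models */
        int j;
        for (j = 1; j <= get_nr_feature(m); j++)
            printf("w[%d] = %g\n", j, get_decfun_coef(m, j, 0));
        printf("bias term = %g\n", get_decfun_bias(m, 0));

        free_and_destroy_model(&m);
    }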