|
module CodeRay
module Scanners

  # Scanner for SAS programs (DATA steps, PROC steps and the macro language).
  #
  # The scanner is a small state machine with two states:
  #
  # * +:initial+ - ordinary code
  # * +:string+  - inside a quoted string (a token group is open)
  #
  # While in +:initial+ it additionally remembers a _context_, because two SAS
  # constructs can only be recognised at the beginning of a statement:
  # the <tt>* comment ;</tt> statement (otherwise <tt>*</tt> is multiplication)
  # and in-stream data introduced by DATALINES / CARDS / LINES.
  #
  # NOTE(review): the token kinds +:dataline+, +:datetime+, +:number+ and
  # +:punctuation+ are kept from the first draft of this scanner. Confirm that
  # the encoders/style sheets in use know them; they are not among the kinds
  # other CodeRay scanners emit (+:integer+, +:float+, +:operator+, ...).
  class SAS < Scanner

    register_for :sas

    file_extension 'sas'

    # Token kinds that are not considered to be running code.
    KINDS_NOT_LOC = [
      :comment,
    ]  # :nodoc:

    module Words  # :nodoc:

      CONSTANT = %w[_n_ _null_]

      MACROBOUND = %w[%macro %mend]

      MACROSTATEMENT = %w[%abort %display %do %else %end %for %global %if %include %macro %mend
        %nrstr %put %quote %str %sysfunc %then %to %unquote %until %while %window]

      # for a complete list, see http://support.sas.com/documentation/cdl/en/allprodsproc/63875/HTML/default/viewer.htm#a003135046.htm
      # for extras included here (run, quit, etc.), see https://gist.github.com/cjdinger/7cf251399ef29b9b90b324a6fc442fca
      PROCNAME = %w[access aceclus allele anom anova append appsrv arima autoreg bmdp bom
        boxplot btl build calendar calis callrfc cancorr candisc capability
        casecontrol catalog catmod cdisc chart cimport clp cluster compare
        compile computab contents convert copula copy corr corresp countreg cpm
        cport cusum cv2view data datasets datasource db2ext db2util dbcstab dbf
        dbload define_event define_tagset dif discrim display distance docparse
        document download dqmatch dqscheme dqsrvadm dqsrvsvc dtree entropy esm
        expand explode export factex factor family fastclus fcmp fmm fontreg
        forecast format forms freq fsbrowse fsedit fsletter fslist fsview g3d
        g3grid ga gam ganno gantt gareabar gbarline gchart gcontour gdevice
        geneselect genmod geocode gfont gimport ginside gis gkeymap gkpi
        glimmix glm glmmod glmpower glmselect gmap goptions gplot gproject gradar
        greduce gremove greplay groovy gslide gtestit gtile haplotype hpcountreg
        hpdmdb hpds2 hpf hpfarimaspec hpfdiagnose hpfengine hpfesmspec hpfevents
        hpfexmspec hpfidmspec hpforest hpfreconcile hpfselect hpfucmspec hplmixed
        hplogistic hpmixed hpneural hpnlin hpreduce hpreg hpsample hpseverity
        hpsummary htsnp http iml import inbreed infomaps intpoint ishikawa items
        javainfo kde krige2d lattice lifereg lifetest loan loess logistic lp
        macontrol macro mapimport mcmc mdc mddb mds means mend metadata metalib
        metaoperate mi mianalyze migrate mixed modeclus model multtest nested
        netdraw netflow nlin nlmixed nlp npar1way olap olapoperate operate
        optex options optload optlp optmilp optmodel optqp optsave orthoreg
        panel pareto pdlreg pds pdscopy phreg plan plm plot pls pm pmenu power
        princomp prinqual print printto probit proc proto prtdef prtexp psmooth
        pwencode qdevice qlim quantreg quest quit rank rdc rdpool rdsec reg
        registry release reliability report risk robustreg rsreg run scaproc
        score seqdesign seqtest server severity sgdesign sgpanel sgplot sgrender
        sgscatter shewhart sim2d similarity simlin simnormal soap sort source
        spectra sql standard statespace statgraph stdize stepdisc stp summary
        surveyfreq surveylogistic surveymeans surveyphreg surveyreg surveyselect
        syslin tabulate tapecopy tapelabel tcalis template timeid timeplot
        timeseries tpspline trans transpose transreg trantab tree tscsreg
        tspl ttest ucm univariate upload userproc varclus varcomp variogram
        varmax vaxtointeg webmddb x11 x12 xsl]

      STATEMENT = %w[abort array attrib axis by class endrsubmit file filename footnote format freq goptions
        infile informat killtask legend libname listtask model note ods options pattern rdisplay
        rget rsubmit select signoff signon symbol sysecho systask table title waitfor where
        weight xaxis yaxis xaxis2 yaxis2]

      KEYWORD = %w[_all_ add alter array attrib axis bandplot barchart barchartparm
        bihistogram3dparm blockplot boxplot boxplotparm break by cards cards4
        class clear close column columns compute continuouslegend contourplotparm
        data declare define densityplot describe disconnect discretelegend
        distinct dm drop dropline dynamic edit ellipse ellipseparm
        else endcomp entry entryfootnote entrytitle excel execute filename
        footnote format freq fringeplot from goptions graphics group
        histogram histogramparm html html5 id if informat input insert
        into keep killtask lineparm listing listtask loessplot merge
        model modelband needleplot nloptions ods options parmcards
        parmcards4 pbsplineplot powerpoint proc put rand ranks rbreak
        rdisplay referenceline regressionplot replace reset retain rget rtf run
        scatterplot scatterplotmatrix seriesplot set signoff signon stepplot
        style surfaceplotparm symbol sysecho table tables tagsets then title
        unique update validate value var vectorplot waitfor weight where]

      FUNCTION = %w[abs addr addrlong airy allcomb allcombi allperm anyalnum anyalpha
        anycntrl anydigit anyfirst anygraph anylower anyname anyprint anypunct
        anyspace anyupper anyxdigit arcos arcosh armend armgtid arminit armjoin
        armproc armstop armstrt armupdt arsin arsinh artanh ascebc atan atan2
        attrc attrn band beta betainv blackclprc blackptprc blkshclprc blshift
        bnot bor bquote brshift bxor byte cat catq cats catt catx cdf ceil
        ceilz cexist char choosec choosen cinv close cmiss cmpres cnonct
        coalesce coalescec collate comb compare compbl compcost compged complev
        compound compress compstor constant convx convxp cos cosh count countc
        countw css curobs cv daccdb daccdbsl daccsl daccsyd dacctab dairy datatyp
        datdif date datejul datepart datetime day dclose dcreate delete depdb
        depdbsl depsl depsyd deptab dequote deviance dhms dif digamma dim dinfo
        divide dnum dopen doptname doptnum dqcase dqgender dqgenderinfoget
        dqgenderparsed dqidentify dqlocaleguess dqlocaleinfoget dqlocaleinfolist
        dqmatch dqmatchinfoget dqmatchparsed dqparse dqparseinfoget dqparsetokenget
        dqparsetokenput dqpattern dqschemeapply dqsrvarchjob dqsrvcopylog
        dqsrvdeletelog dqsrvjobstatus dqsrvkilljob dqsrvprofjobfile dqsrvprofjobrep
        dqsrvuser dqstandardize dqtoken dread dropnote dsname dur durp ebcasc
        effrate envlen erf erfc euclid eval exist exp fact fappend fclose fcol
        fdelete fetch fetchobs fexist fget fileattr fileexist filename fileref
        finance find findc findfile findw finfo finv fipname fipnamel fipstate
        first floor floorz fnonct fnote fopen foptname foptnum fpoint fpos fput
        fread frewind frlen fsep fuzz fwrite gaminv gamma garkhclprc garkhptprc
        gcd geodist geomean geomeanz getdvi getjpi getlog getmsg getoption getquota
        getsym getterm getvarc getvarn graycode grdsvc_enable grdsvc_getaddr
        grdsvc_getinfo grdsvc_getname grdsvc_nnodes harmean harmeanz hbound hms
        holiday hour htmldecode htmlencode ibessel ifc ifn index indexc indexw
        input inputc inputn int intcindex intck intcycle intfit intfmt intget
        intindex intnx intrr intseas intshift inttest intz invcdf iorcmsg iqr irr
        isnull jbessel juldate juldate7 kurtosis label lag largest lbound lcm lcomb
        left length lengthc lengthm lengthn lexcomb lexcombi lexperk lexperm lfact
        lgamma libname libref limmoment log log10 log1px log2 logbeta logcdf logistic
        logpdf logsdf lowcase lperm lpnorm mad margrclprc margrptprc max md5 mdy mean
        median min minute missing mod module modulec modulen modz month mopen
        mort msplint mvalid n netpv nliteral nmiss nodename nomrate notalnum notalpha
        notcntrl notdigit note notfirst notgraph notlower notname notprint notpunct
        notspace notupper notxdigit npv nrbquote nrquote nrstr nvalid nwkdom open
        ordinal pathname pctl pdf peek peekc peekclong peeklong perm point poisson
        poke pokelong probbeta probbnml probbnrm probchi probf probgam probhypr
        probit probmc probnegb probnorm probt propcase prxchange prxdebug prxfree
        prxmatch prxnext prxparen prxparse prxposn prxsubstr ptrlongadd put putc
        putlog putn putsym pvp qcmpres qleft qlowcase qscan qsubstr qsysfunc qtr
        qtrim quantile quote qupcase ranbin rancau rand ranexp rangam range rank
        rannor ranperk ranperm ranpoi rantbl rantri ranuni read_array rename repeat
        resolve reverse rewind right rms round rounde roundz run_macro run_sasfile
        saving savings scan scanq sdf second set setterm sign sin sinh skewness sleep
        smallest soapweb soapwebmeta soapwipservice soapwipsrs soapws soapwsmeta
        softmax solve sortc sortn soundex spedis sqrt squantile std stderr stdize
        stfips stname stnamel str streaminit strip subpad substr substrn sum sumabs
        superq symexist symget symglobl symlocal symput symputx sysevalf sysexist
        sysfunc sysget sysmsg sysparm sysprocessid sysprocessname sysprod sysrc
        system tan tanh termin termout time timepart timevalue tinv tnonct today
        translate transtrn tranwrd trigamma trim trimn trunc ttclose ttcontrl
        ttopen ttread ttwrite uniform unquote upcase urldecode urlencode uss
        uuidgen var varfmt varinfmt varlabel varlen varname varnum varray varrayx
        vartype verify vformat vformatd vformatdx vformatn vformatnx vformatw
        vformatwx vformatx vinarray vinarrayx vinformat vinformatd vinformatdx
        vinformatn vinformatnx vinformatw vinformatwx vinformatx vlabel vlabelx
        vlength vlengthx vms vname vnamex vnext vtype vtypex vvalue vvaluex week
        weekday whichc whichn write_array year yieldp yrdif yyq zipcity zipcitydistance
        zipfips zipname zipnamel zipstate]

    end

    # SAS names are case-insensitive, so every lookup table ignores case.

    # Kind of a plain identifier; anything not listed is an ordinary :ident.
    IDENT_KIND = WordList::CaseIgnoring.new(:ident).
      add(Words::STATEMENT, :keyword).
      add(Words::KEYWORD, :keyword).
      add(Words::CONSTANT, :predefined_constant)  # :nodoc:

    # Kind of a %name token; unlisted names are treated as macro invocations.
    MACRO_KIND = WordList::CaseIgnoring.new(:function).
      add(Words::MACROSTATEMENT, :preprocessor).
      add(Words::MACROBOUND, :preprocessor)  # :nodoc:

    # Membership tests (value is true for listed words, false otherwise).
    PROC_NAME     = WordList::CaseIgnoring.new(false).add(Words::PROCNAME)  # :nodoc:
    FUNCTION_NAME = WordList::CaseIgnoring.new(false).add(Words::FUNCTION)  # :nodoc:

    # In-stream data: the introducing statement, the raw data lines and the
    # terminator, i.e. a line starting with ";" (";;;;" for the "4" variants,
    # whose data may itself contain semicolons).
    # See http://sascommunity.org/wiki/DATALINES_statement
    DATALINES = /
      (?: (?:datalines|cards)4    \s* ; [\s\S]*? [\r\n] \s* ;;;;
        | (?:datalines|cards|lines) \s* ; [\s\S]*? [\r\n] \s* ;
      )
    /ix  # :nodoc:

    # Comments, see http://sascommunity.org/wiki/Comment_statement
    BLOCK_COMMENT     = %r{ /\* [\s\S]*? \*/ }x  # :nodoc:
    MACRO_COMMENT     = / %\* [^;]* ; /x         # :nodoc:
    # Only valid at the start of a statement; ends at the FIRST semicolon.
    STATEMENT_COMMENT = / \* [^;]* ; /x          # :nodoc:

    # Date, time and datetime literals look like strings with a suffix:
    # "1Jan2016"d, '3:14:15pm't, '31jul2001:9:27:05am'dt
    DATE_LITERAL = / (["']) [^"']+ \1 (?:dt?|t) \b /ix  # :nodoc:

    OPERATOR = / \*\*|\|\||!!|¦¦|<>|><|[~¬^<>]?=|[*\/+\-<>&\|!¦~¬^]|\b(?:eq|ne|gt|lt|ge|le|in|not)\b /ix  # :nodoc:

    IDENT      = / [a-z_] \w* /ix    # :nodoc:
    MACRO_NAME = / % [a-z_] \w* /ix  # :nodoc:

    # Hexadecimal (0c1x - must start with a digit, otherwise words such as
    # "fax" would qualify) or decimal with optional fraction and exponent
    # (1.2e23). A leading minus sign is always tokenized as an operator.
    NUMBER = / (?: \d [\da-f]* x \b | \d+ (?:\.\d+)? (?:e[+-]?\d+)? ) /ix  # :nodoc:

    PUNCTUATION = /[$%@.:(){}\[\];,\\]/  # :nodoc:

    # Tokenizes the input and feeds the tokens to +encoder+.
    #
    # encoder:: receives +text_token+, +begin_group+ and +end_group+ calls.
    # options:: scanner options; not used by this scanner.
    #
    # Returns +encoder+.
    def scan_tokens encoder, options

      state = :initial

      # What came before the current token, ignoring white space and comments:
      #   :statement_start - nothing yet, or a ";" that ended a statement
      #   :after_proc      - the keyword PROC (a procedure name is expected)
      #   :inside          - anything else
      context = :statement_start

      # The quote character that opened the string currently being scanned.
      string_delimiter = nil

      until eos?

        case state

        when :initial

          if match = scan(/\s+/)
            encoder.text_token match, :space

          elsif context == :statement_start && (match = scan(DATALINES))
            # The match ends with the terminating semicolon(s), so the next
            # token starts a new statement; context stays :statement_start.
            encoder.text_token match, :dataline

          elsif (match = scan(BLOCK_COMMENT)) ||
                (match = scan(MACRO_COMMENT)) ||
                (context == :statement_start && (match = scan(STATEMENT_COMMENT)))
            # Comments are transparent: they do not change the context.
            encoder.text_token match, :comment

          elsif match = scan(DATE_LITERAL)
            context = :inside
            encoder.text_token match, :datetime

          elsif match = scan(/["']/)
            context = :inside
            string_delimiter = match
            state = :string
            encoder.begin_group :string
            encoder.text_token match, :delimiter

          elsif match = scan(MACRO_NAME)
            # Must be tried before PUNCTUATION, which contains "%".
            context = :inside
            encoder.text_token match, MACRO_KIND[match]

          elsif match = scan(OPERATOR)
            # Must be tried before IDENT because of the mnemonic operators
            # (eq, ne, ...); the \b anchors keep "input" from matching "in".
            context = :inside
            encoder.text_token match, :operator

          elsif match = scan(IDENT)
            kind =
              if context == :after_proc && PROC_NAME[match]
                :keyword
              elsif context != :statement_start && FUNCTION_NAME[match] && check(/\s*\(/)
                # "x = input(...)" is a function call, whereas a leading
                # "input ..." / "put (...)" is the statement of that name.
                :predefined
              else
                IDENT_KIND[match]
              end
            context = match.casecmp('proc').zero? ? :after_proc : :inside
            encoder.text_token match, kind

          elsif match = scan(NUMBER)
            context = :inside
            encoder.text_token match, :number

          elsif match = scan(PUNCTUATION)
            context = match == ';' ? :statement_start : :inside
            encoder.text_token match, :punctuation

          else
            # Unknown character: discard it and mark it as :error.
            context = :inside
            encoder.text_token getch, :error

          end

        when :string

          # SAS has no backslash escapes. Inside a string only the opening
          # quote character is special: doubled it stands for itself, alone
          # it ends the string. The other quote character and line breaks
          # are plain content.
          if match = scan(string_delimiter == '"' ? /[^"]+/ : /[^']+/)
            encoder.text_token match, :content

          elsif match = scan(string_delimiter == '"' ? /""/ : /''/)
            encoder.text_token match, :char

          elsif match = scan(/["']/)
            encoder.text_token match, :delimiter
            encoder.end_group :string
            state = :initial

          else
            # Nice for debugging. Should never happen.
            raise_inspect 'else case in string reached; %p not handled.' % [peek(1)], encoder

          end

        else
          # Nice for debugging. Should never happen.
          raise_inspect 'Unknown state: %p' % [state], encoder

        end

      end

      # An unterminated string leaves its token group open; close it.
      encoder.end_group :string if state == :string

      encoder
    end

  end

end
end
0 commit comments