# This is a dummy implementation of the proposed field splitting
# algorithm (written in sh, so hopefully sh people can follow it)
# to demonstrate that the algorithm as presented generates the
# expected output (that generated by almost every shell).

# This code knows that in the tests IFS=' ,'   (space and comma)
# and rather than handling that generically, which would be possible,
# but messy, simply builds those two characters (literally) into the
# implementation (space, as an IFS white space char, and comma as an
# IFS char that is not white space).

# Similarly the code "knows" that if there is a prefix in the field
# (chars not to be treated as generated by an expansion, and hence
# exempt from splitting) that will be simply a single 'p' always, and
# similarly a suffix will be 'q' - because of that we do not need to
# have any method to indicate what part of the field is to be subject
# to field splitting.

# In the following, comments that start '##' are text lifted directly
# from my proposed section 2.6.5 ("Field Splitting") text, which might
# allow readers to match this algorithm with what is described there.

# The results from this test match exactly the results from all shells
# considered to operate correctly (the same output routine is used, and
# the results compared with diff - with zero differences).

# Test data building blocks: S is one IFS white space character,
# C is one non white space IFS character.
S=' '
C=','

# field_split FIELD
#
# Split FIELD according to the proposed algorithm, with ' ' hard wired
# as the only IFS white space character and ',' as the only other IFS
# character.
#
# Arguments: $1 - the field to be split
# Outputs:   the resulting fields on stdout, each wrapped in single
#            quotes, joined by "$*" (so by the first character of
#            the caller's IFS, a space by default), with no trailing
#            newline.  Nothing at all is written when there are no
#            fields.  The output is meant to be given to "eval set --".
# Globals:   ARG, TAIL, CAND and CD are used as scratch variables
#            (plain sh has no local variables); they are overwritten.
# Limits:    a field containing a single quote would produce output
#            that cannot safely be eval'd; the tests here never do that.
field_split()
{
    ARG=$1      # the field that needs to be split

    set --      # the set of output fields, initially empty

    # IFS is defined (IFS=' ,') and not empty, IFS white space is ' '
    # We simply know that!

    # CAND is our candidate field (deliberately not called C, as C
    # is the comma test datum assigned above, and must survive a
    # call of this function that is not made in a subshell),
    # CD indicates the delimiter that terminated the candidate field
    #   ' ' indicates the delimiter was IFS white space alone
    #   ',' indicates the delimiter was a ',' (perhaps with white space)
    #   ''  indicates there has been no delimiter
    CAND= CD=

    ## Each expansion, or substitution shall be processed in order
    ## as follows [...]

    ## While the input is not empty...
    while test -n "${ARG}"
    do
        ## Consider the first remaining character of the input.
        ## If it is:

        ##  a. A character that did not result from an unquoted
        ##     expansion or substitution:
        ##  b. A character in the input that is not a character in IFS:

        # since we know exactly what the IFS chars are, and that
        # chars that did not result from an expansion (etc) are not
        # IFS chars (our test cases ensure that) we don't need to
        # treat those two differently, just skip forward until we
        # get to an IFS char, or we run out, appending the non-IFS
        # chars to the candidate and removing them from the input.

        # here we only care about the current first char in ${ARG}
        # (the case statement is the loop condition: its exit status,
        # that of the last command run in the selected arm, decides
        # whether the loop continues)
        while
            case "${ARG}" in
            '')
                break 2         # the end of the input, done
                                # (leaves the outer loop as well)
                ;;
            ' '* | ,*)
                false           # delimiter located, exit loop
                ;;
            *)
                TAIL=${ARG#?}                   # something else
                CAND=${CAND}${ARG%"${TAIL}"}    # appended to candidate
                ARG=${TAIL}                     # removed from input
                ;;
            esac
        do
            :
        done

        # Now we are at the start of a delimiter in ARG, and the
        # candidate field is CAND

        # which kind of delimiter do we have?

        ##  c. An IFS white space character:

        # assume the delim will be just IFS white space (case 'c')
        CD=' '
        # and then skip any of that we find (repeating 'c' over & over)
        while case "${ARG}" in ' '*) ARG=${ARG#' '} ;; *) false ;; esac
        do
            :
        done

        ##  d. Another IFS character, not IFS white space:

        # Next if we have a non white space IFS char,
        # then it is the other kind of delimiter (case 'd' in the algo)
        case "${ARG}" in
        ,*)
            CD=,                # Remember we saw it,
            ARG=${ARG#,}        # then remove it
            # and skip any following IFS white space
            while case "${ARG}" in ' '*) ARG=${ARG#' '} ;; *) false ;; esac
            do
                :
            done
            ;;
        esac

        # now a field has been delimited so we are subject to:

        ## At this point, if the candidate is not empty, or if a
        ## non IFS white space character was seen at step d, then
        ## the candidate becomes an output field.
        ## In either case, empty the candidate, and perform the
        ## next iteration.

        if test -n "${CAND}"    # candidate is not empty (or...) => output
        then
            ## if the candidate is not empty
            ## then the candidate becomes an output field.
            set -- "$@" "'${CAND}'"

        # otherwise the candidate is empty, if it was delimited
        # by only IFS white space, then candidate is dropped
        elif test "${CD}" != ' '
        then
            ## or if a non IFS white space character was seen
            ## then the candidate becomes an output field.
            set -- "$@" "''"    # no need for $CAND, it is ""
        fi

        ## In either case, empty the candidate, and perform
        ## the next iteration.
        CD= CAND=
    done

    ## When the input is empty, if the candidate is not empty, it
    ## becomes an output field.
    if test -n "${CAND}"
    then
        # not an empty field after last delim, so it is included
        set -- "$@" "'${CAND}'"
    fi

    # return the split field, as a list of quoted words (to become fields)
    printf %s "$*"
}

# args NAME [FIELD...]
#
# The output routine: print one line of the form
#       NAME:<tab>COUNT:<tab><FIELD1><FIELD2>...
# where COUNT is the number of FIELD arguments.  When there are no
# fields the '<%s>' format is still applied once, so a lone "<>" is
# printed after a COUNT of 0.
args()
{
    name=$1; shift
    printf '%s:\t%d:\t' "$name" "$#"
    printf '<%s>' "$@"
    printf '\n'
}

# tst NAME FIELD
#
# Split FIELD with field_split, and report the fields using args,
# labelled NAME.
tst()
{
    N=$1
    # field_split writes a list of single quoted words, eval turns
    # them back into positional parameters.  The command substitution
    # is quoted so that its text reaches eval untouched by field
    # splitting and pathname expansion in this shell.
    eval "set -- $(field_split "$2")"
    args "$N" "$@"
}

# The test inputs.  Each name spells out its value, one letter for
# each element:  W is a word, S is a space, C is a comma.
# NOTE(review): the word in WSSW is spelled 'abd' rather than 'abc';
# that looks like a typo, but it is visible in the output that gets
# compared with other shells, so it has been left alone.
W='abc'
SW=' abc'
WS='abc '
SWS=' abc '
CW=',abc'
WC='abc,'
CWC=',abc,'
WSW='abc def'
WSSW='abd  def'
WCW='abc,def'
WCCW='abc,,def'
WSCW='abc ,def'
WCSW='abc, def'
WSCSW='abc , def'
WSCSCSW='abc , , def'
WSCSCSWS='abc , , def '
WSCSCSWC='abc , , def,'
SS='  '
SSS='   '
CC=',,'
CCC=',,,'
SC=' ,'
CS=', '
SCCS=' ,, '
CSSC=',  ,'
SCWSCWCS=' ,abc ,def, '
SSCSSCSSCSS='  ,  ,  ,  '

# Every part of the field comes from the expansion
tst W "$W"
tst SW "$SW"
tst WS "$WS"
tst SWS "$SWS"
tst CW "$CW"
tst WC "$WC"
tst CWC "$CWC"
tst WSW "$WSW"
tst WSSW "$WSSW"
tst WCW "$WCW"
tst WCCW "$WCCW"
tst WSCW "$WSCW"
tst WCSW "$WCSW"
tst WSCSW "$WSCSW"
tst WSCSCSW "$WSCSCSW"
tst WSCSCSWS "$WSCSCSWS"
tst WSCSCSWC "$WSCSCSWC"
tst S "$S"
tst C "$C"
tst SS "$SS"
tst SSS "$SSS"
tst CC "$CC"
tst CCC "$CCC"
tst SC "$SC"
tst CS "$CS"
tst SCCS "$SCCS"
tst CSSC "$CSSC"
tst SCWSCWCS "$SCWSCWCS"
tst SSCSSCSSCSS "$SSCSSCSSCSS"

# The same, with the literal prefix 'p'
# NOTE(review): SSS is tested twice in this group, and CC not at all
# (the groups with a suffix below are the same); the second pSSS was
# probably meant to be pCC.  Left as it is, as changing it would
# change the output that is compared with that from other shells.
tst pW "p${W}"
tst pSW "p${SW}"
tst pWS "p${WS}"
tst pSWS "p${SWS}"
tst pCW "p${CW}"
tst pWC "p${WC}"
tst pCWC "p${CWC}"
tst pWSW "p${WSW}"
tst pWSSW "p${WSSW}"
tst pWCW "p${WCW}"
tst pWCCW "p${WCCW}"
tst pWSCW "p${WSCW}"
tst pWCSW "p${WCSW}"
tst pWSCSW "p${WSCSW}"
tst pWSCSCSW "p${WSCSCSW}"
tst pWSCSCSWS "p${WSCSCSWS}"
tst pWSCSCSWC "p${WSCSCSWC}"
tst pS "p${S}"
tst pC "p${C}"
tst pSS "p${SS}"
tst pSSS "p${SSS}"
tst pSC "p${SC}"
tst pCS "p${CS}"
tst pCSSC "p${CSSC}"
tst pSSS "p${SSS}"
tst pCCC "p${CCC}"
tst pSCCS "p${SCCS}"
tst pSCWSCWCS "p${SCWSCWCS}"
tst pSSCSSCSSCSS "p${SSCSSCSSCSS}"

# With the literal suffix 'q'
tst Wq "${W}q"
tst SWq "${SW}q"
tst WSq "${WS}q"
tst SWSq "${SWS}q"
tst CWq "${CW}q"
tst WCq "${WC}q"
tst CWCq "${CWC}q"
tst WSWq "${WSW}q"
tst WSSWq "${WSSW}q"
tst WCWq "${WCW}q"
tst WCCWq "${WCCW}q"
tst WSCWq "${WSCW}q"
tst WCSWq "${WCSW}q"
tst WSCSWq "${WSCSW}q"
tst WSCSCSWq "${WSCSCSW}q"
tst WSCSCSWSq "${WSCSCSWS}q"
tst WSCSCSWCq "${WSCSCSWC}q"
tst Sq "${S}q"
tst Cq "${C}q"
tst SSq "${SS}q"
tst SSSq "${SSS}q"
tst SCq "${SC}q"
tst CSq "${CS}q"
tst CSSCq "${CSSC}q"
tst SSSq "${SSS}q"
tst CCCq "${CCC}q"
tst SCCSq "${SCCS}q"
tst SCWSCWCSq "${SCWSCWCS}q"
tst SSCSSCSSCSSq "${SSCSSCSSCSS}q"

# With both the prefix and the suffix
tst pWq "p${W}q"
tst pSWq "p${SW}q"
tst pWSq "p${WS}q"
tst pSWSq "p${SWS}q"
tst pCWq "p${CW}q"
tst pWCq "p${WC}q"
tst pCWCq "p${CWC}q"
tst pWSWq "p${WSW}q"
tst pWSSWq "p${WSSW}q"
tst pWCWq "p${WCW}q"
tst pWCCWq "p${WCCW}q"
tst pWSCWq "p${WSCW}q"
tst pWCSWq "p${WCSW}q"
tst pWSCSWq "p${WSCSW}q"
tst pWSCSCSWq "p${WSCSCSW}q"
tst pWSCSCSWSq "p${WSCSCSWS}q"
tst pWSCSCSWCq "p${WSCSCSWC}q"
tst pSq "p${S}q"
tst pCq "p${C}q"
tst pSSq "p${SS}q"
tst pSSSq "p${SSS}q"
tst pSCq "p${SC}q"
tst pCSq "p${CS}q"
tst pCSSCq "p${CSSC}q"
tst pSSSq "p${SSS}q"
tst pCCCq "p${CCC}q"
tst pSCCSq "p${SCCS}q"
tst pSCWSCWCSq "p${SCWSCWCS}q"
tst pSSCSSCSSCSSq "p${SSCSSCSSCSS}q"