CSV:: Fichero Fuente CSV.cpp

00001 // CSV.cpp  (C) 2008 adolfo@di-mare.com
00002 
00003 #ifdef English_dox
00004 /** \file   CSV.cpp
00005     \brief  Implementation for \c CSV.h.
00006     \author Adolfo Di Mare <adolfo@di-mare.com>
00007     \date   2008
00008 */
00009 #endif
00010 
00011 #ifdef Spanish_dox
00012 /** \file   CSV.cpp
00013     \brief  Implementación para \c CSV.h.
00014     \author Adolfo Di Mare <adolfo@di-mare.com>
00015     \date   2008
00016 */
00017 #endif
00018 
00019 #include "CSV.h"
00020 
00021 #define COMMA  ','
00022 #define DQUOTE '"'
00023 #define LF '\n' // Line Feed
00024 #define CR '\r' // Carriage Return
00025 
00026 
00027 
00028 //    Actions for the finite automaton used to parse CSV input
00029 //    ========================================================
00030 //    [              ] ==> n=0; i=0; DATA[0] = "";
00031 //    [   csv=""     ] ==> ++n; DATA[n] = ""; ++i;
00032 //    [              ] ==> ++i;
00033 //    [   h+=        ] ==> DATA[n] += str[i]; ++i;
00034 //    [  h='""'      ] ==> DATA[n] = """"; ++i;
00035 //    [     END      ] ==> return n;
00036 //
00037 //            |  ',' '\n'  |    '"'     |     l      |
00038 //    delta() |  comma+LF  |  d-quote   |   letter   |
00039 //  ----------+------------+------------+------------+
00040 //   ==>  0   |     0      |     1      |     3      |
00041 //       init |   return   |            |  csv+=ch   |
00042 //  ----------+------------+------------+------------+
00043 //        1   |     1      |     2      |     1      |
00044 //   quoted(1)|  csv+=ch   |            |  csv+=ch   |
00045 //  ----------+------------+------------+------------+
00046 //        2   |     0      |     1      |     3      |
00047 //  inquote(2)|   return   |  csv+=ch   |  csv='""'  |
00048 //  ----------+------------+------------+------------+
00049 //        3   |     0      |     3      |     3      |
00050 //    regular |   return   |  csv+=ch   |  csv+=ch   |
00051 //  ----------+------------+------------+------------+
00052 
00053 #ifdef English_dox
00054 /** Scans input stream \c CIN and returns the next CSV value.
00055     - The retrieved value from \c CIN gets stored into \c csv.
00056     - Works with \c char, not tested for \c wchar_t.
00057     - Stops when \c CIN.fail() or when \c CIN.eof().
00058     - Will not remove any chars from the retrieved value.
00059 
00060     \return true when the CSV complies with RFC-4180.
00061 */
00062 #endif
00063 #ifdef Spanish_dox
00064 /** Obtiene del flujo de entrada \c CIN el siguiente valor CSV.
00065     - El valor obtenido de \c CIN queda almacenado en \c csv.
00066     - Trabaja bien con \c char, no ha sido probado para \c wchar_t.
00067     - Para cuando \c CIN.fail() o cuando \c CIN.eof().
00068     - No elimina ningún caracter del valor obtenido.
00069 
00070     \return true cuando el campo CSV sigue la especificación RFC-4180.
00071 */
00072 #endif
00073 bool automataCSV( std::string& csv, std::istream& CIN ) {
00074     csv.clear();
00075     if ( CIN.fail() || CIN.eof() ) { // see http://www.horstmann.com/cpp/pitfalls.html
00076         return false;
00077     }
00078     int state=0;  char ch;
00079     bool trailing_CR = false; // true when the last char was CR
00080     bool ret_val     = true;  // true while csv complies with RFC-4180
00081     for (;;) {
00082         CIN.get(ch);
00083         if ( CIN.fail() || CIN.eof() ) {
00084             return ret_val;
00085         }
00086         csv += ch;
00087 
00088         switch (state) {
00089         case 0: { // init
00090                 if ( ch == COMMA ) {
00091                     return ret_val;
00092                 }
00093                 else if ( ch == LF ) {
00094                     return ret_val;
00095                 }
00096                 else if ( ch == CR ) {
00097                     trailing_CR = true;
00098                     state = 3;
00099                 }
00100                 else if ( ch == DQUOTE ) {     //            |  ',' '\n'  |    '"'     |     l      |
00101                     state = 1;                 //    delta() |  comma+LF  |  d-quote   |   letter   |
00102                 }                              //  ----------+------------+------------+------------+
00103                 else { // letter               //   ==>  0   |     0      |     1      |     3      |
00104                     state = 3;                 //       init |   return   |            |  csv+=ch   |
00105                 }                              //  ----------+------------+------------+------------+
00106             }
00107             break;
00108 
00109         case 1: { // quote(1)
00110                 if ( ch == DQUOTE ) {          //            |  ',' '\n'  |    '"'     |     l      |
00111                     state = 2;                 //    delta() |  comma+LF  |  d-quote   |   letter   |
00112                 }                              //  ----------+------------+------------+------------+
00113             //  else { // letter COMMA LF      //        1   |     1      |     2      |     1      |
00114             //      state = 1;                 //   quoted(1)|  csv+=ch   |            |  csv+=ch   |
00115             //  }                              //  ----------+------------+------------+------------+
00116             }
00117             break;
00118 
00119         case 2: { // inquote(2)
00120                 if ( ch == COMMA ) {
00121                 //  state = 0;
00122                     return ret_val;
00123                 } else if ( ch == LF ) {
00124                 //  state = 0;
00125                     return ret_val;
00126                 }
00127                 else if ( trailing_CR ) { //  ["...""..."\r?...,] '?' after '\r'
00128                     trailing_CR = false;
00129                     ret_val = false;
00130                     state = 3;
00131                 }
00132                 else if ( ch == CR ) {
00133                     trailing_CR = true;
00134                 //  state = 2;
00135                 }
00136                 else if ( ch == DQUOTE ) {     //            |  ',' '\n'  |    '"'     |     l      |
00137                     state = 1;                 //    delta() |  comma+LF  |  d-quote   |   letter   |
00138                 }                              //  ----------+------------+------------+------------+
00139                 else { // letter (error)       //        2   |     0      |     1      |     3      |
00140                     ret_val = false;           //  inquote(2)|   return   |  csv+=ch   |  csv='""'  |
00141                     state = 3;                 //  ----------+------------+------------+------------+
00142                 }                              // [," ... "" "3x,] ==> error condition ["3]
00143             }
00144             break;
00145 
00146         case 3: { // regular
00147                 if ( ch == COMMA ) {
00148                 //  state = 0;
00149                     return ret_val;
00150                 }
00151                 else if ( ch == LF ) {
00152                 //  state = 0;
00153                     return ret_val;
00154                 }                              //            |  ',' '\n'  |    '"'     |     l      |
00155                 else { // letter               //    delta() |  comma+LF  |  d-quote   |   letter   |
00156                 //  state = 3;                 //  ----------+------------+------------+------------+
00157                 // swallows DQUOTE's && CR's   //        3   |     0      |     3      |     3      |
00158                 }                              //    regular |   return   |  csv+=ch   |  csv+=ch   |
00159             }                                  //  ----------+------------+------------+------------+
00160             break;
00161 
00162         } // swith (state)
00163     } // for (;;)
00164 
00165     return ret_val;
00166 }
00167 
00168 void singleDQUOTE( std::string & str );
00169 
00170 bool getNextCSV( std::string& csv, std::istream& CIN ) {
00171     bool correct = automataCSV( csv, CIN );
00172     bool ret_val = false;    // true if ( csv[ csv.length()-1 ] == LF )
00173     size_t N = csv.length(); // number of retrieved chars
00174 
00175     if ( correct ) {
00176         if ( csv.empty() ) {
00177             return ret_val;
00178         }
00179         N--; // last char
00180         if ( csv[N] == COMMA ) {
00181             csv.erase(N); // chop() trailing comma
00182         }
00183         else if( csv[N] == LF ) {
00184             ret_val = true;
00185             csv.erase(N); // chop() trailing LF
00186             if ( N>0 ) {
00187                 N--;
00188                 if( csv[N] == CR ) {
00189                     csv.erase(N); // chop() trailing CR
00190                 }
00191             }
00192         }
00193 
00194         if ( ! csv.empty() ) {
00195             if ( csv[0] == DQUOTE ) {
00196                 singleDQUOTE( csv ); // transfrom [""] ==> ["]
00197             }
00198         }
00199     }
00200 
00201     else {   // assert( correct == false );
00202         if ( N>0 ) {
00203             N--; // last char
00204             if ( csv[N] == COMMA ) {
00205                 csv.erase(N,1); // removes trailing comma
00206             }
00207         }
00208     }
00209     return ret_val;
00210 }
00211 
00212 void setQuotedCSV( std::string& res , const std::string& value ) {
00213     std::string::const_iterator ch;
00214     bool quote_surround = false;
00215     res.clear();
00216     for ( ch = value.begin(); ch != value.end(); ++ch ) {
00217         if ( isspace( *ch ) || *ch == COMMA ) {
00218             quote_surround = true;
00219         }
00220         else if ( *ch == DQUOTE ) {
00221             res += DQUOTE;
00222             quote_surround = true;
00223         }
00224         res += *ch;
00225     }
00226 
00227     if ( quote_surround ) {
00228         res = DQUOTE + res + DQUOTE;
00229     }
00230 }
00231 
// Removes leading and trailing whitespace from str, in place.
// - Whitespace is any char for which isspace() returns non-zero.
// - A string made only of whitespace becomes empty.
void trim( std::string & str ) {
    const std::string::size_type LEN = str.length();

    // skip leading whitespace; isspace() is undefined for negative
    // values, hence the cast to unsigned char
    std::string::size_type first = 0;
    while ( first < LEN && isspace( static_cast<unsigned char>( str[first] ) ) ) {
        ++first;
    }
    if ( first == LEN ) { // empty or all whitespace
        str.clear();
        return;
    }

    // str[first] is not whitespace, so this scan stops at first+1 at the latest
    std::string::size_type last = LEN; // one past the last char to keep
    while ( isspace( static_cast<unsigned char>( str[last-1] ) ) ) {
        --last;
    }

    // leave out trailing whitespace first, so the indexes stay valid
    str.erase( last );
    str.erase( 0, first );
}
00258 
00259 void trimCSV( std::string & str ) {
00260     trim( str ); // 1) trim()
00261     if ( str.empty() )  {
00262         return;
00263     }
00264 
00265     // D-Quoted???
00266     std::string::size_type N = str.length()-1;
00267     if ( str[0] != DQUOTE || str[N] != DQUOTE )  {
00268         return;
00269     }
00270 
00271     // Substitute each double DQUOTE's by a single DQUOTE
00272     singleDQUOTE( str );
00273     return;
00274 }
00275 
00276 
00277 #ifdef English_dox
/// Removes the enclosing DQUOTE's of \c str and substitutes each doubled DQUOTE with a single one.
00279 #endif
00280 #ifdef Spanish_dox
/// Sustituye cada letra DQUOTE doble por una sola letra DQUOTE en \c str.
00282 #endif
00283 void singleDQUOTE( std::string & str ) {
00284     // Substitute each double DQUOTE's by a single DQUOTE
00285     std::string tmp;
00286     std::string::const_iterator from, next;
00287     for ( from = str.begin(); from != str.end(); ++from ) {
00288         tmp.push_back( *from );
00289         if ( *from == DQUOTE ) { // already copied the first
00290             next = from; next++;
00291             if ( next == str.end() ) {
00292                 break;
00293             }
00294             else if ( *next == DQUOTE ) {
00295                 from = next; // don´t copy the second DQUOTE
00296             }
00297         }
00298     }
00299     // Removed enclosing (outermost) DQUOTE's
00300     str = tmp.substr(1, tmp.length()-2);
00301     return;
00302 }
00303 
// Removes the last char of str when it is equal to ch.
// - Nothing happens for an empty string or when the last char differs.
void chop( std::string & str , char ch ) {
    const std::string::size_type LEN = str.length();
    if ( LEN > 0 && str[LEN-1] == ch ) {
        str.resize( LEN-1 ); // drop just that last char
    }
}
00313 
00314 
00315 #if 0
00316 
00317 /// Test ==> \c rebuildDquote().
// Checks that rebuildDquote() writes every DQUOTE of its argument twice
// and leaves any other char (blanks, COMMA, CR, LF) as it is.
// - The block between the double braces is the excerpt that the Doxygen
//   documentation of rebuildDquote() pulls in with \skipline / \until,
//   so its marker comment must stay as it is.
void test_CSV::test_rebuildDquote() {
    void rebuildDquote( std::string & str );
    {{  // test::rebuildDquote()
        std::string s;
        s =  "\"" ; rebuildDquote(s);       // ["] ==> [""]
        assertTrue( s == "\"\"");
        s =  "\" \" \"" ; rebuildDquote(s); // [" " "] ==> ["" "" ""]
        assertTrue( s == "\"\" \"\" \"\"");
        s =  "3,4\"" ; rebuildDquote(s);    // [3,4"] ==> [3,4""]
        assertTrue( s == "3,4\"\"");
        s =  " ," ; rebuildDquote(s);       // [ ,] ==> [ ,]
        assertTrue( s == " ,");
    }}
    {   // A61196-A76944
        // quoted value followed by more fields and a CR+LF line break
        std::string s =  "\"2\",3, \r\n";        // ["2",3, \r\n]
        rebuildDquote(s);
        assertTrue( s ==  "\"\"2\"\",3, \r\n");  // ["2",3, \r\n] ==> [""2"",3, \r\n]
    }
}
00337 
00338 #ifdef English_dox
00339 /** Scans \c str substituting \c '"' by 2 double-quotes \c [""].
00340     - Local routine used in the implementation of \c getNextCSV().
00341 
00342     \dontinclude test_CSV.cpp
00343     \skipline    test::rebuildDquote()
00344     \until       }}
00345     \see         test_CSV::test_rebuildDquote()
00346 */
00347 #endif
00348 #ifdef Spanish_dox
00349 /** Sustituye en \c str cada comilla doble \c '"' por 2 comillas dobles \c [""].
    - Rutina local usada en la implementación de \c getNextCSV().
00351 
00352     \dontinclude test_CSV.cpp
00353     \skipline    test::rebuildDquote()
00354     \until       }}
00355     \see         test_CSV::test_rebuildDquote()
00356 */
00357 #endif
00358 void rebuildDquote( std::string & str ) {
00359     std::string res;
00360     std::string::const_iterator ch;
00361     for ( ch = str.begin(); ch != str.end(); ++ch ) {
00362         res += *ch;
00363         if ( *ch == DQUOTE ) {
00364             res += DQUOTE;
00365         }
00366     }
00367     str = res;
00368 }
00369 
// Earlier implementation of the CSV field reader; it is compiled out by
// the enclosing '#if 0'.
// - Reads chars from CIN one by one and builds the field value in csv.
// - The terminating COMMA or LF is never stored into csv.
// - The enclosing DQUOTE's of a quoted field are not stored and [""] is
//   stored as a single DQUOTE.
// - When text follows the closing DQUOTE, the value read so far is put
//   back into quoted form with rebuildDquote() and reading goes on as a
//   regular (unquoted) field.
// - Returns true when the field ended with LF; returns false when it
//   ended with COMMA or when CIN failed (end of input included).
bool getNextCSV_OLD( std::string& csv, std::istream& CIN ) {
    csv.clear();
    if ( CIN.fail() ) { // see http://www.horstmann.com/cpp/pitfalls.html
        return false;
    }
    int state=0; char ch;
    bool trailing_CR = false; // true when the last char was CR
    for (;;) {
        CIN.get(ch);
        if ( CIN.fail() ) {
            return false; // no more input: whatever is in csv stays there
        }

        switch (state) {
        case 0: { // init
                if ( ch == COMMA ) {
                //  csv += COMMA; // removes COMMA from result string
                    return false;
                }
                else if ( ch == LF ) {
                //  csv += LF;    // removes LF from result string
                    return true;
                }
                else if ( ch == CR ) {
                    trailing_CR = true;
                    csv += CR;
                    state = 3;
                }
                else if ( ch == DQUOTE ) {     //            |  ',' '\n'  |    '"'     |     l      |
                    state = 1;                 //    delta() |  comma+LF  |  d-quote   |   letter   |
                }                              //  ----------+------------+------------+------------+
                else { // letter               //   ==>  0   |     0      |     1      |     3      |
                    csv += ch;                 //       init |   return   |            |  csv+=ch   |
                    state = 3;                 //  ----------+------------+------------+------------+
                }
            }
            break;

        case 1: { // quote(1)
                if ( ch == DQUOTE ) {          //            |  ',' '\n'  |    '"'     |     l      |
                    state = 2;                 //    delta() |  comma+LF  |  d-quote   |   letter   |
                }                              //  ----------+------------+------------+------------+
                else { // letter COMMA LF      //        1   |     1      |     2      |     1      |
                    csv += ch;                 //   quoted(1)|  csv+=ch   |            |  csv+=ch   |
                //  state = 1;                 //  ----------+------------+------------+------------+
                }
            }
            break;

        case 2: { // inquote(2)
                if ( ch == COMMA ) {
                //  state = 0;
                    return false;
                } else if ( ch == LF ) {
                //  state = 0;
                    return true;
                }
                else if ( trailing_CR ) { //  ["...""..."\r?...,] '?' after '\r'
                    // the CR was held back; put it into the rebuilt value
                    rebuildDquote( csv );
                    csv = DQUOTE + csv + DQUOTE + CR + ch;
                    trailing_CR = false;
                    state = 3;
                }
                else if ( ch == CR ) { // removes CR+LF at the end of line
                    trailing_CR = true;
                //  csv += CR; // removes trailing CR+LF
                //  state = 2;
                }
                else if ( ch == DQUOTE ) {     //            |  ',' '\n'  |    '"'     |     l      |
                    csv += DQUOTE;             //    delta() |  comma+LF  |  d-quote   |   letter   |
                    state = 1;                 //  ----------+------------+------------+------------+
                }                              //        2   |     0      |     1      |     3      |
                else { // letter (error)       //  inquote(2)|   return   |  csv+=ch   |  csv='""'  |
                    rebuildDquote( csv );      //  ----------+------------+------------+------------+
                    csv= DQUOTE + csv + DQUOTE + ch;  // [," ... "" "3x,] ==> error condition ["3]
                    state = 3;                        //  [" ... "" "3]  ==> rebuilt value
                }
            }
            break;

        case 3: { // regular
                if ( ch == COMMA ) {
                    return false;
                } else if ( ch == LF ) {
                    // NOTE(review): trailing_CR is not reset when a regular
                    // char follows a CR, so the char removed here may not
                    // be a CR -- confirm before reusing this code.
                    if ( trailing_CR ) {
                        csv = csv.substr( 0, csv.length()-1 ); // chop( csv , CR );
                    }
                //  state = 0;
                //  csv += LF;
                    return true;
                }
                else if ( ch == CR ) {    // leaves CR at the end
                    trailing_CR = true;   // mark to remove later
                    csv += CR;
                //  state = 3;
                }                              //            |  ',' '\n'  |    '"'     |     l      |
                else { // letter               //    delta() |  comma+LF  |  d-quote   |   letter   |
                    csv += ch;                 //  ----------+------------+------------+------------+
                //  state = 3;                 //        3   |     0      |     3      |     3      |
                // swallows DQUOTE's && CR's   //    regular |   return   |  csv+=ch   |  csv+=ch   |
               }                               //  ----------+------------+------------+------------+
            }
            break;

        } // switch (state)
    } // for (;;)

    return false;
}
00479 
00480 #endif
00481 
00482 #ifdef English_dox
00483 /// Comma Separated Value (not used in this implementation).
00484 #endif
00485 #ifdef Spanish_dox
00486 /// Comma Separated Value (no usado en esta implementación).
00487 #endif
00488 namespace csv { } // trick to include it into the Doxygen documentation
00489 
00490 // Trick to force Doxygen to document these.
00491 // - They are at the end of file to avoid trouble.
00492 using namespace std;
00493 using namespace csv;
00494 
00495 // EOF: CSV.cpp