// CSV.h (C) 2008  adolfo@di-mare.com

#ifdef English_dox
/// Doxygen English documentation.
    #define English_dox "Doxygen English documentation"
   /// \def English_dox  ///< Marks English documentation blocks.
#endif
#ifdef Spanish_dox
// The Doxygen text in this block is intentionally written in Spanish: it is
// only seen when the Spanish_dox macro is defined.
// (English: "Documentation in Spanish" / "Macro used so that Doxygen
//  generates the documentation in Spanish".)
/// Documentación en español.
    #define Spanish_dox "Documentación en español"
   /// \def Spanish_dox  ///< Macro usado para que Doxygen genere documentación en español.
#endif

#ifdef English_dox
/** \file  CSV.h
    \brief \c getNextCSV() and \c setQuotedCSV(): library to process CSV files.

    CSV: Comma Separated Value. The CSV file format has been formally
    described by the IETF in RFC-4180.
    - If the file complies with the format specified in RFC-4180,
      its CSV fields will be correctly extracted by these routines.
    - An effort was made to comply with RFC-4180.
    \see http://tools.ietf.org/html/rfc4180

    There are 2 main routines to process CSV files:
    - To get values use function \c getNextCSV().
    - To store values use function \c setQuotedCSV() and store
      the resulting string.

    Class \c CSV_line is a wrapper around these routines, but requires
    that no quoted Line Feed characters \c "\n" appear within each
    line in a CSV file.

    \author Adolfo Di Mare <adolfo@di-mare.com>
    \date   2008
*/
#endif

#ifndef CSV_h
#define CSV_h

#ifdef English_dox
/// \def CSV_h ///< Avoids multiple inclusion.
#endif
#ifdef Spanish_dox
// Spanish Doxygen text kept on purpose (English: "Avoids multiple inclusion").
/// \def CSV_h ///< Evita la inclusión múltiple.
#endif

#ifdef __cplusplus // compatibility C <==> C++

#include <string>
#include <iostream>
#include <vector>

// CSV: Comma Separated Values [IETF RFC-4180].
// namespace csv {

#ifdef English_dox
/** Prepares \c value for output into a CSV file.
    - Stores a new value into string \c res.
    - Surrounds the result in double-quotes when \c value has whitespace.
    - Surrounds the result in double-quotes when \c value has double-quotes.
    - Surrounds the result in double-quotes when \c value has commas \c ",".
    - Substitutes any double-quotes \c '"' within \c value with 2 double-quotes \c [""].
    - Works with \c char, not tested for \c wchar_t.

    \param res   String that receives the prepared CSV field.
    \param value Raw field text to prepare; it is not modified.

    \dontinclude test_CSV.cpp
    \skipline    test::setQuotedCSV()
    \until       }}
    \see         test_CSV::setQuotedCSV()
*/
#endif
void setQuotedCSV( std::string& res , const std::string& value );

#ifdef English_dox
/** Scans input stream \c CIN and returns the next CSV value.
    - \c CIN should be open in \c std::ios::binary mode as chars are
      extracted one by one, using \c CIN.get(ch).
    - The retrieved value from \c CIN gets stored into \c csv.
    - Works with \c char, not tested for \c wchar_t.
    - Removes from \c csv the trailing (CR+LF or LF)
      ==> \c "\r\n" or \c "\n".
    - An effort was made to comply with RFC-4180.

    \param csv String that receives the extracted field value.
    \param CIN Input stream the field is read from.

    \return true when the CSV field ends in \c "\n" (LF -> LineFeed).

    \see http://tools.ietf.org/html/rfc4180
    \see http://www.horstmann.com/cpp/pitfalls.html

    \dontinclude CSV_line.cpp
    \skipline    test::getNextCSV()
    \until       }}
    \see         test_CSV::getNextCSV()
*/
#endif
bool getNextCSV( std::string& csv, std::istream& CIN );

#ifdef English_dox
/** Deletes leading and trailing whitespace from \c "str".
    - It will also delete characters <code>" \f\n\r\t\v"</code>.
    - Uses \c isspace(ch) to find out if a letter is whitespace.

    \param str String to trim; it is modified in place.

    \dontinclude test_CSV.cpp
    \skipline    test::trim()
    \until       }}
    \see         test_CSV::test_trim()
*/
#endif
void trim( std::string & str );

#ifdef English_dox
/** Converts an incorrect CSV field value into its probably correct value.
    - Strips out leading and trailing whitespace with \c trim().
    - If the trimmed field is surrounded by quotes it will try to replace
      every pair of double quotes \c [""] by a single double quote \c ["].
    - Will not verify that double quotes are correctly paired.

    Sometimes a FILE.csv has quoted fields surrounded by whitespace. As these
    field values do not comply with RFC-4180, they are extracted by
    \c getNextCSV() as they come, with no whitespace removed and with their
    double quotes pairs intact. In the following example the string is
    enclosed in square brackets \c [..] instead of double quotes \c ["]
    for legibility:

    \code
    ["zero", "if "" 1" , , " 3xt" \r\n]
    [....0.,........ 1..,2,.........3...]

      csv field           getNextCSV()     trimCSV()
    +------------------+----------------+----------+
    | ["zero"]         | [zero]         | [zero]   |
    | [, "if "" 1" ]   | [ "if "" 1" ]  | [if " 1] |
    | [, ]             | [ ]            | []       |
    | [, " 3xt" \r\n]  | [ " 3xt" ]     | [ 3xt]   |
    +------------------+----------------+----------+
    \endcode

    By common sense, the programmer would expect that these strings be
    returned as they appear in the \c trimCSV() column, but the fact
    of the matter is that the only one that complies with RFC-4180 is
    the first one. After using \c trimCSV() on the value returned by
    \c getNextCSV() the result is what is reasonably expected.
    - Nonetheless, field values that contain carriage returns \c "\r"
      or line feeds \c "\n" are probably processed in a way
      different from what is expected, even before they are passed
      as arguments to \c trimCSV(). It is wiser not to trust this routine
      as a complete solution to process CSV files that do not fully comply
      with RFC-4180.

    \param str Field value to repair; it is modified in place.

    \dontinclude test_CSV.cpp
    \skipline    test::trimCSV()
    \until       }}
    \see         test_CSV::test_trimCSV()
*/
#endif
void trimCSV( std::string & str );

#ifdef English_dox
/** Deletes \c ch when it is the trailing character in \c str.
    - The deleted character always is \c ch.

    \param str String to chop; it is modified in place.
    \param ch  Character to remove from the end of \c str (defaults to 0).

    \dontinclude test_CSV.cpp
    \skipline    test::chop()
    \until       }}
    \see         test_CSV::test_chop()
*/
#endif
void chop( std::string & str , char ch=0 );

// }; // namespace csv

#ifdef English_dox
    /// Defined by the C++ standard library
    namespace std { } // trick to include it into the Doxygen documentation
#endif

#endif // __cplusplus

#include <stdlib.h> // NULL, etc. [C language]

#ifdef __cplusplus
extern "C" {
#endif

// put in here C declarations

#ifdef __cplusplus
}
#endif

#endif

// EOF: CSV.h