在C/C++中以空白字符作为分隔符将字符串拆分为字符串数组的更好方法

A better way to split a string into an array of strings in C/C++ using whitespace as a delimiter

对不起，我的C/C++不太好，但即便如此，下面的代码在我看来也很糟糕。它还有一个错误——当str = "07/02/2010"以'\0'结尾时会失败。我认为与其修复这个bug，不如重写它。在Python中，这只需要'kas\nhjkfh kjsdjkasf'.split()。我知道这是C风格的代码，但拆分字符串不应该这么复杂！在保持相同签名、不使用额外库的前提下，我该如何改进它——让它简短而优雅？我能看出这段代码有坏味道，例如末尾总是有else子句。

失败的行：

1
2
3
4
5
6

// Copies the (nIndex - nLast) characters that start at psz + nLast into s.
// The destination size passed to the call is the same value as the copy
// count, so once that many characters are copied there is no slot left for
// the string terminator.
_tcsncpy_s(
s.GetBuffer((int) (nIndex-nLast)), // destination buffer obtained from s
nIndex-nLast, // destination size handed to the call
psz+nLast, // first character of the segment being copied
(size_t) (nIndex-nLast) // number of characters to copy
);

当字符串"07/02/2010"以'\0'结尾时，它会尝试将11个字符写入只有10个字符长的缓冲区。

全功能：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114

#define

// Breaks the text string into pieces and appends them to strArray.
// This function is called from SetControlText to parse the text string
// into an array of CStrings that the control gadgets will attempt to
// interpret.
//
// str        - text to split.
// pOldStyle  - not used by this function.
// strArray   - receives the extracted pieces (appended, never cleared here).
//
// Returns TRUE when strArray is non-empty on exit, FALSE otherwise.
//
// NOTE(review): the escape sequences inside the character/string literals of
// this function were lost in the copy this was taken from. They are restored
// here as '\r', '\n', "\r\n" and " " (a single space, inferred from the
// "separated by space" remark below) -- confirm against the original source.
BOOL CLVGridDateTimeCtrl::ParseTextWithCurrentFormat(const CString& str, const CGXStyle* pOldStyle, CStringArray& strArray )
{
    // Unused:
    pOldStyle;

    // we assume that the significant segments are separated by space

    // Please change m_strDelim to add other delimiters

    CString s;

    LPCTSTR psz = (LPCTSTR) str;

    BOOL bLastCharSpace = FALSE;
    // +1 so that the loop also visits the terminating NUL character.
    DWORD size = str.GetLength()+1;

    // (newline will start a new row, tab delimiter will
    // move to the next column).
    // parse buffer (DBCS aware)
    // nLast is the index just past the previous boundary character.
    for (DWORD nIndex = 0, nLast = 0; nIndex < size; nIndex += _tclen(psz+nIndex))
    {
        // check for a delimiter
        if (psz[nIndex] == _T('\0') || _tcschr(_T("\r\n"), psz[nIndex]) || _tcschr(_T(" "), psz[nIndex])
            ||!_tcscspn(&psz[nIndex], (LPCTSTR)m_strDelim))
        {
            s.ReleaseBuffer();
            s.Empty();
            // abort parsing the string if next char
            // is an end-of-string
            if (psz[nIndex] == _T('\0'))
            {
                // This test cannot succeed here: psz[nIndex] is NUL in this branch.
                if (psz[nIndex] == _T('\r') && psz[nIndex+1] == _T('\n'))
                    nIndex++;

                // NOTE(review): the destination size equals the copy count, so
                // no room is left for the terminator. This is the call the
                // surrounding discussion reports as failing.
                _tcsncpy_s(s.GetBuffer((int) (nIndex-nLast)),
                    nIndex-nLast,
                    psz+nLast,
                    (size_t) (nIndex-nLast));
                CString temStr = s;
                strArray.Add(temStr);
                temStr.Empty();
                break;
            }

            // Current character is in m_strDelim and the previous character
            // was not already treated as a boundary: emit the pending segment.
            else if (_tcscspn(&psz[nIndex], (LPCTSTR)m_strDelim) == 0 && !bLastCharSpace)
            {
                if (psz[nIndex] == _T('\r') && psz[nIndex+1] == _T('\n'))
                    nIndex++;

                // Same size/count pattern as the copy in the NUL branch.
                _tcsncpy_s(s.GetBuffer((int) (nIndex-nLast)),
                    nIndex-nLast,
                    psz+nLast,
                    (size_t) (nIndex-nLast));
                CString temStr = s;
                strArray.Add(temStr);
                temStr.Empty();
                bLastCharSpace = TRUE;
                // abort parsing the string if next char
                // is an end-of-string
                if (psz[nIndex+1] == _T('\0'))
                    break;

            }
            // Now, that the value has been copied to the cell,
            // let's check if we should jump to a new row.
            else if (_tcschr(_T(" "), psz[nIndex]) && !bLastCharSpace)
            {
                if (psz[nIndex] == _T('\r') && psz[nIndex+1] == _T('\n'))
                    nIndex++;

                // Same size/count pattern as the copy in the NUL branch.
                _tcsncpy_s(s.GetBuffer((int) (nIndex-nLast)),
                    nIndex-nLast,
                    psz+nLast,
                    (size_t) (nIndex-nLast));
                CString temStr = s;
                strArray.Add(temStr);
                temStr.Empty();
                bLastCharSpace = TRUE;
                // abort parsing the string if next char
                // is an end-of-string
                if (psz[nIndex+1] == _T('\0'))
                    break;
            }

            // The next segment starts right after the current character.
            nLast = nIndex + _tclen(psz+nIndex);

        }
        else
        {
            // nLast = nIndex + _tclen(psz+nIndex);
            bLastCharSpace = FALSE;
        }
    }
    if (strArray.GetSize())
        return TRUE;
    else
        return FALSE;
}

编辑：m_strDelim = _T(",");和此成员变量仅用于此函数。我想我现在看到了标记化技术的意义了——它试图解析一个日期和时间……等等，还有更多！下面是调用此函数的代码。请也帮我改进一下。我的一些同事声称C语言使他们没有比C++更具生产力。我以前觉得自己像个白痴，因为我不能对自己说同样的话。

// SetControlText will attempt to convert the text to a valid date first with
// the help of COleDateTime and then with the help of the Date control and the
// current format
//
// str        - in: the text to convert; out: rewritten with the converted
//              value on the paths below that assign to it.
// nRow, nCol - cell coordinates, used only to compose a style when
//              pOldStyle is NULL.
// pOldStyle  - style to use; when NULL a temporary style is created here and
//              recycled before returning.
//
// Returns TRUE when str was empty, was parsed by CLVDateTime::ParseDateTime,
// or could be split by ParseTextWithCurrentFormat; FALSE otherwise.
BOOL CLVGridDateTimeCtrl::ConvertControlTextToValue(CString& str, ROWCOL nRow, ROWCOL nCol, const CGXStyle* pOldStyle)
{
    CGXStyle* pStyle = NULL;
    BOOL bSuccess = FALSE;

    if (pOldStyle == NULL)
    {
        pStyle = Grid()->CreateStyle();
        Grid()->ComposeStyleRowCol(nRow, nCol, pStyle);
        pOldStyle = pStyle;
    }

    // allow only valid input
    {
        // First do this
        CLVDateTime dt;

        if (str.IsEmpty())
        {
            // if (Grid()->IsCurrentCell(nRow, nCol))
            // Reset();
            bSuccess = TRUE;
        }
        else if (dt.ParseDateTime(str,CLVGlobals::IsUSDateFormat()) && (DATE) dt != 0)
        {
            SetDateTime(dt);
            if (m_bDateValueAsNumber)
                str.Format(_T("%g"), (DATE) dt);
            else
                str = dt.Format();
            bSuccess = TRUE;
        }
        else
        {
            // parse the string using the current format
            CStringArray strArray;
            // On failure bSuccess stays FALSE and control falls through to the
            // clean-up below, so a style created above is always recycled
            // (an early return here would skip RecycleStyle).
            if (ParseTextWithCurrentFormat(str, pOldStyle, strArray))
            {
                UpdateNullStatus(m_TextCtrlWnd);

                SetFormat(m_TextCtrlWnd, *pOldStyle);

                // Index of the next parsed piece to hand to a gadget.
                int nArrIndex = 0;
                for(int i=0; i<m_TextCtrlWnd.m_gadgets.GetSize(); i++)
                {
                    int val = m_TextCtrlWnd.m_gadgets[i]->GetValue();
                    // s.Empty();
                    if(m_TextCtrlWnd.m_gadgets[i]->IsKindOf(RUNTIME_CLASS(SECDTNumericGadget)))
                    {
                        // TRACE(_T("The value %s\n"), strArray[nArrIndex]);
                        ((CLVDTNumericGadget*)m_TextCtrlWnd.m_gadgets[i])->m_nNewValue = _ttoi(strArray[nArrIndex]);
                        nArrIndex++;
                        // Stop once every parsed piece has been consumed.
                        if (nArrIndex>strArray.GetUpperBound())
                            break;
                    }
                    else if(m_TextCtrlWnd.m_gadgets[i]->IsKindOf(RUNTIME_CLASS(SECDTListGadget)) && val!=-1)
                    {
                        int nIndex = ((CLVDTListGadget*)m_TextCtrlWnd.m_gadgets[i])->FindMatch(strArray[nArrIndex], ((CLVDTListGadget*)m_TextCtrlWnd.m_gadgets[i])->GetValue()+1);
                        if (nIndex!=-1)
                        {
                            // TRACE(_T("The value %s\n"), strArray[nArrIndex]);
                            ((CLVDTListGadget*)m_TextCtrlWnd.m_gadgets[i])->SetValue(nIndex);
                            nArrIndex++;
                            // Stop once every parsed piece has been consumed.
                            if (nArrIndex>strArray.GetUpperBound())
                                break;
                        }

                    }

                    // NOTE(review): this refresh of str sits inside the gadget
                    // loop, so the `break` statements above skip it for the
                    // last consumed piece -- confirm that this is intended.
                    CLVDBValue dbDate = m_TextCtrlWnd.GetDateTime();
                    if (dbDate.IsNull())
                        str = _T("");
                    else
                    {
                        CLVDateTime dtValue = (CLVDateTime)dbDate;
                        if (m_bDateValueAsNumber)
                            str.Format(_T("%g"), (DATE) dtValue);
                        else
                            str = dtValue.Format();
                    }
                }
                bSuccess = TRUE;
            }
        }
    }

    // Give back the temporary style if one was created above.
    if (pStyle)
        Grid()->RecycleStyle(pStyle);

    return bSuccess;
}

字符串工具包库(strtk)为您的问题提供了以下解决方案：

1
2
3
4
5
6
7
8
9
10
11
12
13

#include <string>
#include <deque>
#include"strtk.hpp"
// Splits a sample string into tokens with strtk and stores them in a deque.
// NOTE(review): the escape sequences and the space in the two string literals
// were lost in the copy this was taken from; they are restored here as "\n"
// and ", \r\n" -- confirm against the original.
int main()
{
    std::string data("kas\nhjkfh kjsdjkasf");
    std::deque<std::string> str_list;
    // Delimiter characters: comma, space, carriage return, line feed.
    strtk::parse(data, ", \r\n", str_list);
    return 0;
}

这里有更多的例子

相关讨论

在C++中，使用字符串流(std::istringstream)可能最简单：

1
2
3
4
5
6
7
8

// Wrap the text in an input stream; operator>> on std::string then yields
// one whitespace-separated token per extraction.
std::istringstream buffer("kas\nhjkfh kjsdjkasf");

std::vector<std::string> strings;

// Read tokens until the stream is exhausted and append each one to `strings`.
std::copy(std::istream_iterator<std::string>(buffer),
          std::istream_iterator<std::string>(),
          std::back_inserter(strings));

我还没有尝试坚持完全相同的签名，主要是因为大多数都是非标准的，所以它一般不适用于C++。

另一种可能是使用Boost::tokenizer，尽管很明显这涉及到另一个库，所以我不会试图更详细地介绍它。

我不确定这是否符合"bizarro语法"。我可能要在这方面做点工作…

编辑：我知道了——改为初始化矢量：

1
2
3
4
5
6

// Wrap the text in an input stream; operator>> on std::string then yields
// one whitespace-separated token per extraction.
std::istringstream buffer("kas\nhjkfh kjsdjkasf");

// Construct the vector directly from the token range. The extra parentheses
// around the first argument keep this from being parsed as a function
// declaration.
std::vector<std::string> strings(
    (std::istream_iterator<std::string>(buffer)),
    std::istream_iterator<std::string>());

"bizarro"部分在于：如果第一个参数外面没有那对额外的括号，就会触发"最令人烦恼的解析"(most vexing parse)，于是它声明的是一个函数，而不是定义一个向量。:-)

edit2：就问题的编辑而言，几乎不可能直接回答——这取决于太多既不标准也没有解释的类型(如CGXStyle、CLVDateTime)。就我而言，我根本无法弄清其中的细节。当然，这看起来是一个相当糟糕的设计：让用户输入多少有些模棱两可的东西，然后再试图理清混乱。最好使用只允许明确输入的控件，这样您就可以直接读取包含日期和时间的若干字段。

edit3:执行拆分的代码(也将逗号视为分隔符)可以这样执行：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

#include <algorithm>
#include <iostream>
#include <iterator>
#include <locale>
#include <sstream>
#include <string>
#include <vector>

// A ctype facet for char that classifies ',' as whitespace in addition to the
// characters the classic table already marks as such, so that stream
// extraction treats commas as token separators.
class my_ctype : public std::ctype<char> {
public:
    // Returns a pointer to the classification table used by this facet.
    // Declared static because the constructor calls it while initializing the
    // base class; calling a non-static member function at that point, before
    // the base sub-object has been constructed, is undefined behavior.
    static mask const *get_table() {
        // this copies the "classic" table used by <ctype.h>:
        static std::vector<std::ctype<char>::mask>
            table(classic_table(), classic_table()+table_size);

        // Anything we want to separate tokens, we mark its spot in the table as 'space'.
        table[','] = (mask)space;

        // and return a pointer to the table:
        return &table[0];
    }

    // refs is forwarded to the base facet; the `false` tells the base class
    // not to delete the table, which is owned by the static vector above.
    my_ctype(size_t refs=0) : std::ctype<char>(get_table(), false, refs) { }
};

int main() {
// put our data in a strea:
std::istringstream buffer("first kas
hjkfh kjsdjk,asf\tlast");

// Create a ctype object and tell the stream to use it for parsing tokens:
my_ctype parser;
buffer.imbue(std::locale(std::locale(), &parser));

// separate the stream into tokens:
std::vector<std::string> strings(
(std::istream_iterator<std::string>(buffer)),
std::istream_iterator<std::string>());

// copy the tokes to cout so we can see what we got:
std::copy(strings.begin(), strings.end(),
std::ostream_iterator<std::string>(std::cout,"
"));
return 0;
}

相关讨论

最好的方法是使用strtok。这个链接应该是关于如何使用它的不言自明的，您也可以使用多个分隔符。非常方便的C函数。

相关讨论

+ 1，但我敢肯定有人会有一个疯狂的C++解决方案，包括BIZARRO语法来拿走你的选票。
这是一个很好的解决方案。如果你想使用C++，没有什么"疯狂"的编码语言的优势。
如果strtok是正确的答案，你通常会问错问题。
@杰瑞，你什么意思？应该改为史汤克吗？
@哈米什：通常情况下，它应该是(除其他外)不修改其输入的东西，并且不具有如此容易出错的接口。
@杰瑞，对不起，又困惑了…那么，strtok可以修改它的输入，并且它有一个容易出错的接口？怎么会这样？我在哪里可以阅读更多关于这个的信息？
@哈米什：注意strtok怎么不说const char *str，这是因为它修改了输入字符串。
@哈米什：更准确地说，strtok将(至少在正常情况下)修改其输入——这就是它定义的工作方式，也是它容易出错的部分原因(例如，将字符串文本传递给它会导致未定义的行为)。要使用它的序列也是笨拙的——调用一次传递输入字符串，然后重复传递空值，只有当返回空值时才会停止。
关于strtok的一个小细节值得注意：它不是线程安全的。我不知道这是否真的适用于OP的问题，但仍然值得注意。
@汤姆：那其实是一个实现细节。大多数POSIX系统上的版本不是线程安全的。另一方面(OTOH)，MS多线程库中的版本是线程安全的(尽管通过分配线程本地存储非常重要)。
通常情况下，修改strtok()的输入没有什么问题——在大多数情况下，您希望使用strtok()进行解析，而不再关心未解析的字符串。字符串字面上的抱怨也是假的；编译时常量字符串的运行时解析绝对是一个小问题。
我更担心线程安全而不是变异。我总是可以复制一个要变异的字符串，但我确实希望结果是正确的。
@Hamish Grubijan：大多数系统都提供了一个strtok()替代方案，可以以线程安全的方式使用，例如重新进入的strtok_r()。
啊，很好…那么，我可以在VS2010中这样做而不安装额外的东西吗？您介意用一个工作代码的例子发布一个单独的答案吗？
@Hamish Grubijan:strtok()是一个标准函数。我可以在几个小时内提供strtok()的示例代码，当代码对我可用时。这是你想要的，还是你在找strtok_r()？我从来没有和后者合作过，但如果你需要的话，我可以去看看。让我知道这是否有帮助，如果你还在研究这个问题。
@如果你有解决方案，我可以用它。我还没有修复这个bug；其他的事情更优先。
很抱歉，我一直很忙，但这是网上的一个例子。希望这有帮助。elook.org/programming/c/strtok.html(编辑)

解决这个问题的一个非常重要的方法是使用Qt库。如果您使用的是KDE，那么它们已经安装好了。QString类有一个成员函数split，其工作方式与Python版本类似。例如

1	QString("This is a string").split(" ", QString::SkipEmptyParts)

返回QStrings的QStringList：

1	["This","is","a","string"]

(用pythonic语法)。注意，第二个参数是必需的，否则如果单词被多个空格拆分，则将返回每个单独的参数。

一般来说，在Qt库的帮助下，Python的大多数简单性，例如简单的字符串解析和列表迭代，都可以很容易地处理，并且具有C++的功能。

相关讨论

比我的另一个答案更好的方法：TR1的regex特性。这里有一个小的教程让你开始学习。这个答案是C++，使用正则表达式(这也许是最好的/最简单的方法来分割一个字符串)，而且我最近自己使用过，所以我知道它是一个很好的工具。

您可以使用boost::algorithm::split。即。：

1
2
3
4
5

std::string myString;
std::vector<std::string> splitStrings;
// Split myString at every character contained in the delimiter set and store
// the pieces in splitStrings.
// NOTE(review): the delimiter literal was garbled in the copy this was taken
// from; it is restored here as space, carriage return and line feed --
// confirm against the original.
boost::algorithm::split(splitStrings, myString, boost::is_any_of(" \r\n"));

相关讨论

在C/C++中解析字符串很少是一件简单的事情。您发布的方法看起来包含了相当多的"历史"。例如，您声明要在空白处拆分字符串，但该方法本身似乎把成员变量m_strDelim也用作拆分决策的一部分。简单地替换该方法可能会导致其他意想不到的问题。

使用一个现有的标记化类(比如这个Boost库)可以简化很多事情。

相关讨论