我正在构建一个 C 工具,它会在 XML 文件中查找所有 <loc> 元素,并使用 curl 抓取其中的 URL,检查它们是返回 404 错误还是可以正常访问。
到目前为止,我的代码的所有模块都运行良好,除了当我尝试将获取的 URL 列表存储在 urlStack 变量中时,我收到此警告:
crawler.c: In function ‘locXMl’:
crawler.c:119:34: warning: assignment to ‘char’ from ‘char *’ makes integer from pointer without a cast [-Wint-conversion]
119 | urlStack[stackIndex] = regSearch("<loc>[^<]*<\\/loc>",fileData);
当我运行编译后的程序时,出现了段错误(segmentation fault)。
这是代码:
#include <stdio.h>
#include <curl/curl.h>
#include <string.h>
#include <regex.h>
/* Forward declarations for the helper functions defined after main. */
int statusCode(char *addr);
char* regSearch(char* regPattern, char* hayStack);
/* NOTE(review): urlStack is a plain char* here, i.e. a pointer to chars
 * rather than a list of string pointers. */
char locXMl(char* fileName, char* urlStack );
/*
 * Entry point: checks that a file name was given on the command line and
 * that the file can be opened for reading, then calls locXMl on it.
 * Returns 1 when the argument is missing or the file cannot be opened.
 */
int main(int argc, char *argv[]) {
/* NOTE(review): with an empty initializer this array has no elements, so
 * there is no slot to read or write; `= {}` itself also needs C23 or a
 * compiler extension. */
char* urlStack[] = {};
// check that a file name was specified
if(argc < 2){
printf("\nMissing file name.\n");
return 1;
} // otherwise continue
// check that the file exists (more precisely: that it can be opened)
FILE *file;
if((file = fopen(argv[1],"r")) == NULL )
{
printf("\nSpecified file does not exist\n");
printf("\n%s\n",argv[1]);
return 1;
}
/* NOTE(review): `file` is never closed in this function. */
// build the list of URLs in urlStack
/* NOTE(review): *urlStack reads the first element of the array above and
 * passes that single char* value, not the array itself. */
locXMl(argv[1],*urlStack);
}
/*
 * Requests addr with libcurl, without downloading the body, and returns the
 * response code reported by CURLINFO_RESPONSE_CODE.
 *
 * addr: URL to probe.
 *
 * Returns the response code, or 0 when libcurl could not be initialised,
 * no handle could be created, or the transfer itself failed.
 */
int statusCode(char *addr){
    /* Starts at 0 so every failure path returns a defined value. */
    long response_code = 0;
    CURL *curl;

    /* No other libcurl function may be used if global init fails. */
    if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
        return 0;
    }

    curl = curl_easy_init();
    if (curl) {
        curl_easy_setopt(curl, CURLOPT_URL, addr);
        /* libcurl options of this kind take a long, hence 1L. */
        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
        if (curl_easy_perform(curl) == CURLE_OK) {
            curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
        }
        curl_easy_cleanup(curl);
    }

    /* Release the global libcurl resources acquired above. */
    curl_global_cleanup();
    return (int)response_code;
}
/*
 * Tests whether hayStack contains a match for regPattern, which is compiled
 * as a POSIX basic regular expression.
 *
 * regPattern: pattern to compile.
 * hayStack:   NUL-terminated text to search; it is not modified.
 *
 * Returns hayStack itself (the whole input, not the matched substring) when
 * a match is found. Returns NULL when there is no match, when either
 * argument is NULL, or when the pattern cannot be compiled or executed; the
 * last two cases also print a message to stderr.
 */
char* regSearch(char* regPattern, char* hayStack) {
    regex_t regex;
    char *result = NULL;
    int reti;

    if (regPattern == NULL || hayStack == NULL) {
        return NULL;
    }

    /* Match offsets are never used, so ask only for a yes/no answer. */
    reti = regcomp(&regex, regPattern, REG_NOSUB);
    if (reti) {
        fprintf(stderr, "Could not compile regex\n");
        /* Nothing was compiled, so there is nothing to regfree or execute. */
        return NULL;
    }

    reti = regexec(&regex, hayStack, 0, NULL, 0);
    if (!reti) {
        result = hayStack;
    } else if (reti != REG_NOMATCH) {
        fprintf(stderr, "Regex execution failed\n");
    }

    /* Single exit point so the compiled pattern is released on every path. */
    regfree(&regex);
    return result;
}
/*
 * Reads fileName with fgets, at most 99 characters per call, and for every
 * chunk on which regSearch reports a <loc>...</loc> match, assigns the
 * regSearch result to urlStack[stackIndex] and advances stackIndex.
 *
 * NOTE(review): urlStack is a char*, so urlStack[stackIndex] is a single
 * char; assigning the char* returned by regSearch to it is the assignment
 * the compiler warning quoted above points at.
 * NOTE(review): the function is declared to return char but has no return
 * statement; fptr is not checked for NULL and is never closed.
 */
char locXMl(char* fileName, char* urlStack ){
FILE *fptr;
fptr = fopen(fileName,"r");
char fileData[100];
int stackIndex = 0;
while(fgets(fileData, 100, fptr)) {
/* regSearch is run twice per chunk: once as the test, once for the value. */
if(regSearch("<loc>[^<]*<\\/loc>",fileData)){
urlStack[stackIndex] = regSearch("<loc>[^<]*<\\/loc>",fileData);
stackIndex++;
//printf("%s\n", regSearch("<loc>[^<]*<\\/loc>",fileData));
}
}
}
我尝试过更改 urlStack 变量的声明方式,但问题仍然存在。我期望的结果是把所有 URL 以数组的形式存入该变量。
您需要在声明中添加一个额外的星号:
char locXMl(char* fileName, char** urlStack );
并且调用时没有星号:
locXMl(argv[1],urlStack);
当然,你必须为 urlStack 中的所有指针分配空间。
您还会收到另外一些警告:非 void 函数没有返回值,以及把字符串常量传给非 const 的 char* 参数。不过这些并不是您所问的问题。
我根据@SupportUkraine提供的答案修改了代码如下,这解决了我在这里提到的具体问题:
#include <stdio.h>
#include <curl/curl.h>
#include <string.h>
#include <regex.h>
#include <stdlib.h>
/* Forward declarations for the helper functions defined after main. */
int statusCode(char *addr);
char* regSearch(char* regPattern, char* hayStack);
/* urlStack is an array of string pointers supplied by the caller. */
char** locXMl(char* fileName, char** urlStack );
/*
 * Entry point.
 *
 * Expects the XML file name as the first command-line argument, verifies
 * that the file can be opened, lets locXMl fill urlStack, and prints the
 * entry at index 10 when one was stored there.
 *
 * Returns 0 on success, 1 when the argument is missing, the file cannot be
 * opened, or locXMl reports failure by returning NULL.
 */
int main(int argc, char *argv[]) {
    /* Zero-initialised so that every slot locXMl does not fill reads as
     * NULL instead of an indeterminate pointer. */
    char *urlStack[1000] = {0};

    /* A file name is required. */
    if (argc < 2) {
        printf("\nMissing file name.\n");
        return 1;
    }

    /* Probe the file up front so the user gets a clear message. */
    FILE *file = fopen(argv[1], "r");
    if (file == NULL) {
        printf("\nSpecified file does not exist\n");
        printf("\n%s\n", argv[1]);
        return 1;
    }
    fclose(file);

    /* Build the list of URLs in urlStack. */
    if (locXMl(argv[1], urlStack) == NULL) {
        return 1;
    }

    /* Only print the slot when something was actually stored in it;
     * passing a NULL pointer to %s is undefined behaviour. */
    if (urlStack[10] != NULL) {
        printf("%s", urlStack[10]);
    }

    /* Nothing is released here: the process ends immediately. */
    return 0;
}
/*
 * Requests addr with libcurl, without downloading the body, and returns the
 * response code reported by CURLINFO_RESPONSE_CODE.
 *
 * addr: URL to probe.
 *
 * Returns the response code, or 0 when libcurl could not be initialised,
 * no handle could be created, or the transfer itself failed.
 */
int statusCode(char *addr){
    /* Starts at 0 so every failure path returns a defined value. */
    long response_code = 0;
    CURL *curl;

    /* No other libcurl function may be used if global init fails. */
    if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
        return 0;
    }

    curl = curl_easy_init();
    if (curl) {
        curl_easy_setopt(curl, CURLOPT_URL, addr);
        /* libcurl options of this kind take a long, hence 1L. */
        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
        if (curl_easy_perform(curl) == CURLE_OK) {
            curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
        }
        curl_easy_cleanup(curl);
    }

    /* Release the global libcurl resources acquired above. */
    curl_global_cleanup();
    return (int)response_code;
}
/*
 * Tests whether hayStack contains a match for regPattern, which is compiled
 * as a POSIX basic regular expression.
 *
 * regPattern: pattern to compile.
 * hayStack:   NUL-terminated text to search; it is not modified.
 *
 * Returns hayStack itself (the whole input, not the matched substring) when
 * a match is found. Returns NULL when there is no match, when either
 * argument is NULL, or when the pattern cannot be compiled or executed; the
 * last two cases also print a message to stderr.
 */
char* regSearch(char* regPattern, char* hayStack) {
    regex_t regex;
    char *result = NULL;
    int reti;

    if (regPattern == NULL || hayStack == NULL) {
        return NULL;
    }

    /* Match offsets are never used, so ask only for a yes/no answer. */
    reti = regcomp(&regex, regPattern, REG_NOSUB);
    if (reti) {
        fprintf(stderr, "Could not compile regex\n");
        /* Nothing was compiled, so there is nothing to regfree or execute. */
        return NULL;
    }

    reti = regexec(&regex, hayStack, 0, NULL, 0);
    if (!reti) {
        result = hayStack;
    } else if (reti != REG_NOMATCH) {
        fprintf(stderr, "Regex execution failed\n");
    }

    /* Single exit point so the compiled pattern is released on every path. */
    regfree(&regex);
    return result;
}
/*
 * Collects the lines of an XML file that contain a <loc>...</loc> element.
 *
 * fileName: path of the file to read.
 * urlStack: caller-provided array of string pointers that receives the
 *           results, filled from index 0 upwards. Slots beyond the last
 *           stored entry are left untouched.
 *
 * Each stored entry is a separately malloc'ed copy of the matching chunk as
 * returned by fgets (the whole chunk, not just the URL, including any
 * trailing newline). Ownership of the copies passes to the caller, who may
 * free() them.
 *
 * Returns urlStack, or NULL when an argument is NULL or the file cannot be
 * opened. Reading stops early when an allocation fails or the capacity
 * limit below is reached.
 */
char** locXMl(char* fileName, char** urlStack ){
    /* The signature carries no capacity, so the limit mirrors the 1000-slot
     * array that main passes in. NOTE(review): revisit if another caller
     * ever supplies a differently sized array. */
    enum { URL_STACK_CAPACITY = 1000, LINE_BUFFER_SIZE = 100 };
    char fileData[LINE_BUFFER_SIZE];
    int stackIndex = 0;
    FILE *fptr;

    if (fileName == NULL || urlStack == NULL) {
        return NULL;
    }

    fptr = fopen(fileName, "r");
    if (fptr == NULL) {
        return NULL;
    }

    /* Lines longer than the buffer arrive as several chunks; each chunk is
     * tested on its own. */
    while (stackIndex < URL_STACK_CAPACITY &&
           fgets(fileData, sizeof fileData, fptr)) {
        if (regSearch("<loc>[^<]*<\\/loc>", fileData) == NULL) {
            continue;
        }

        /* fileData is reused on every iteration and disappears when this
         * function returns, so each result needs its own storage. */
        size_t length = strlen(fileData) + 1;
        char *copy = malloc(length);
        if (copy == NULL) {
            break;
        }
        memcpy(copy, fileData, length);
        urlStack[stackIndex] = copy;
        stackIndex++;
    }

    fclose(fptr);
    return urlStack;
}