-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathleeXML.cpp
160 lines (141 loc) · 4.9 KB
/
leeXML.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// vim: set expandtab tabstop=8 shiftwidth=8 foldmethod=marker:
/** @file leeXML.cpp
* Basado en ejemplo de expat
*
* @package Mt77
* @author Vladimir Támara Patiño. vtamara@pasosdeJesus.org
* Dominio público. 2009. Sin garantías
* http://creativecommons.org/licenses/publicdomain/
* @version $Id: leeXML.cpp,v 1.11 2010/01/18 16:12:50 vtamara Exp $
*/
#include <stdio.h>
#include <iostream>
#include "expat.h"
#include "NodoTrieS.hpp"
using namespace std;
#if defined(__amigaos__) && defined(__USE_INLINE__)
#include <proto/expat.h>
#endif
/* XML_FMT_INT_MOD is the printf length modifier spliced into format strings
 * in this file (as "%" XML_FMT_INT_MOD "u").  Its value depends on whether
 * XML_LARGE_SIZE is defined and, in that case, on the compiler in use. */
#ifdef XML_LARGE_SIZE
#if defined(XML_USE_MSC_EXTENSIONS) && _MSC_VER < 1400
#define XML_FMT_INT_MOD "I64"
#else
#define XML_FMT_INT_MOD "ll"
#endif
#else
#define XML_FMT_INT_MOD "l"
#endif
// Element nesting depth: incremented by startElement, decremented by endElement.
int profundidad = 0;
// Number of the document being read; assigned by leeXML from its ndoc argument.
int numdoc = -1;
// Trie that receives the words; leeXML points it at its t argument.
NodoTrieS *nodotries;
// Expat parser created by leeXML; charHandler queries it for the current byte index.
XML_Parser parser = NULL;
/**
 * Start-of-element callback handed to expat by leeXML.
 * It only records that the parser went one level deeper; the element
 * name and its attributes are not inspected.
 *
 * @param userData Not used
 * @param name     Not used
 * @param atts     Not used
 */
static void XMLCALL
startElement(void *userData, const char *name, const char **atts)
{
        profundidad += 1;
}
/**
 * Removes redundant whitespace from s.
 *
 * Every run of consecutive whitespace characters is reduced to the first
 * character of the run, and whitespace at the very beginning is dropped.
 * A run at the end of the string is kept as a single character.
 *
 * @param s Text to compact
 *
 * @return Copy of s without the redundant whitespace
 */
string
solopal(string s)
{
        string r;
        r.reserve(s.length());
        bool eraesp = true; // Treat the start as "after a space" to drop leading blanks
        for (string::size_type i = 0; i < s.length(); i++) {
                // isspace() is only defined for values representable as
                // unsigned char (or EOF); bytes >= 0x80 of UTF-8 text are
                // negative when char is signed, so convert first.
                const bool esesp =
                        isspace(static_cast<unsigned char>(s[i])) != 0;
                if (!esesp || !eraesp) {
                        r += s[i];
                }
                eraesp = esesp;
        }
        return r;
}
long inipal = -1000; // Position at which the text held in ultpal begins (set by charHandler from the parser's byte index)
// Text accumulated by charHandler and not yet indexed; endElement empties it.
string ultpal = "";
/**
 * End-of-element callback handed to expat by leeXML.
 *
 * Splits the text accumulated in ultpal at single spaces and inserts every
 * non-empty piece in the trie pointed to by nodotries, twice: once with
 * insertaConEtiqueta, passing the name of the element being closed, and
 * once with insertaNormalizando.  The position given for each piece is
 * inipal plus the offset of the piece inside ultpal.  Afterwards ultpal is
 * emptied and the nesting depth decreases by one.
 *
 * @param userData Not used
 * @param name     Name of the element being closed
 */
static void XMLCALL
endElement(void *userData, const char *name)
{
        if (!ultpal.empty()) {
                vector<string> pals = estalla(" ", ultpal);
                long la = 0; // Offset of pals[i] inside ultpal
                for (size_t i = 0; i < pals.size(); i++) {
                        if (pals[i] != "") {
                                nodotries->insertaConEtiqueta(pals[i],
                                                name,
                                                numdoc, inipal + la);
                                nodotries->insertaNormalizando(pals[i],
                                                numdoc, inipal + la, true);
                        }
                        // Advance for empty pieces as well: each of them
                        // stands for one separator, and skipping it would
                        // shift the position of every later word.
                        // NOTE(review): assumes estalla yields an empty
                        // piece per extra consecutive separator -- confirm.
                        la += pals[i].length() + 1;
                }
                ultpal = "";
        }
        profundidad--;
}
/**
 * Character-data callback handed to expat by leeXML.
 *
 * Appends the len bytes starting at s to ultpal.  When ultpal is still
 * empty, the parser's current byte index is stored in inipal first, so it
 * marks where the accumulated text begins.  Nothing happens if len is not
 * positive or if no element is open (profundidad is not positive).
 *
 * @param userData Not used
 * @param s        Text reported by the parser
 * @param len      Number of bytes available at s
 */
static void XMLCALL
charHandler(void *userData, const char *s, int len)
{
        if (len <= 0 || profundidad <= 0) {
                return;
        }
        if (ultpal.empty()) {
                inipal = XML_GetCurrentByteIndex(parser);
        }
        ultpal.append(s, len);
}
/** Construye un trieS a partir de un XML*/
void leeXML(const char *na, long ndoc, NodoTrieS &t)
{
ASSERT(na!=NULL && na[0] != '\0' && strlen(na)<FILENAME_MAX);
ASSERT(ndoc >= 0);
numdoc = ndoc;
char buf[BUFSIZ];
parser = XML_ParserCreate("UTF-8");
if (XML_SetEncoding(parser, "UTF-8") != XML_STATUS_OK) {
throw "No puede ponerse codificación UTF-8" ;
}
int done;
int depth = 0;
XML_SetUserData(parser, &depth);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, charHandler);
nodotries = &t;
FILE *fh = fopen(na, "r");
if (fh == NULL) {
return;
}
do {
uint32_t len = (int)fread(buf, 1, sizeof(buf), fh);
done = len < sizeof(buf);
if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
fprintf(stderr,
"%s at line %" XML_FMT_INT_MOD "u\n",
XML_ErrorString(XML_GetErrorCode(parser)),
XML_GetCurrentLineNumber(parser));
return;
}
} while (!done);
XML_ParserFree(parser);
fclose(fh);
}