js can have attr (ed103f8d) · Commits · jan.koester / libhtmlpp

src/html.cpp

+83 −30

Original line number	Diff line number	Diff line
		@@ -1470,65 +1470,118 @@ size_t libhtmlpp::ScriptElement::parseElement(
		bool& termination
		){
		termination = false;

		el = std::make_unique<ScriptElement>();
		auto* self = static_cast<ScriptElement*>(el.get()); // _Script ist hier gültig
		auto* self = static_cast<ScriptElement*>(el.get());

		size_t i = start;
		if (i >= in.size() || in[i] != '<') {
		// If it doesn't start with '<', we can't parse a tag here.
		return start;
		}

		// Helper to perform case-insensitive comparison
		auto iequals = [](char a, char b) {
		return std::tolower(static_cast<unsigned char>(a)) ==
		std::tolower(static_cast<unsigned char>(b));
		};

		// Helper to perform case-insensitive match for a keyword starting at 'pos'
		auto match_ci = [&](size_t pos, const char* k) -> bool {
		for (size_t i = 0; k[i]; ++i) {
		if (pos + i >= in.size()) return false;
		if (!iequals(in[pos + i], k[i])) return false;
		for (size_t j = 0; k[j]; ++j) {
		if (pos + j >= in.size() || !iequals(in[pos + j], k[j])) {
		return false;
		}
		}
		return true;
		};

		size_t i = start;
		// --- 1. Validate Opening Tag Name (<script) ---
		++i; // Consume '<'

		if (i >= in.size() || in[i] != '<') return i;
		// Skip leading whitespace after '<'
		while (i < in.size() && std::isspace(static_cast<unsigned char>(in[i]))) {
		++i;
		}

		++i; // '<'
		while (i < in.size() && std::isspace(static_cast<unsigned char>(in[i]))) ++i;
		const char* kw = "script";
		for (size_t k = 0; kw[k] && i < in.size(); ++k, ++i) {
		if (!iequals(in[i], kw[k])) {
		while (i < in.size() && in[i] != '>') ++i;
		if (i < in.size()) ++i; // '>'
		const char* tag_keyword = "script";
		size_t keyword_len = std::char_traits<char>::length(tag_keyword);

		if (i + keyword_len >= in.size() || !match_ci(i, tag_keyword)) {
		// Tag name doesn't match "script"
		// Skip till next '>' and return the position after it.
		while (i < in.size() && in[i] != '>') {
		++i;
		}
		if (i < in.size()) {
		++i; // Consume '>'
		}
		return i;
		}
		i += keyword_len; // Consume "script"

		// --- 2. Extract Opening Tag and Attributes ---
		// Find the closing '>' of the opening tag.
		size_t tag_end = i;
		while (i < in.size() && in[i] != '>') {
		++i;
		}

		while (i < in.size() && in[i] != '>') ++i;
		if (i < in.size() && in[i] == '>') ++i; // '>' konsumieren
		// Capture the raw opening tag data (including '<script' and attributes) for serialization.
		if (i > start && in[i] == '>') {
		// Copy data from '<' (start) up to and including '>' (i)
		std::vector<char> raw_tag_data(in.begin() + start, in.begin() + i + 1);
		self->_serialelize(raw_tag_data);
		}

		if (i >= in.size() \|\| in[i] != '>') {
		// The tag was never closed (e.g., '<script src="..." EOF')
		return i;
		}

		++i; // Consume '>' and move to content start
		size_t content_begin = i;

		// --- 3. Extract Script Content (CDATA-like section) ---
		for (; i < in.size(); ++i) {
		// Look for the start of the closing tag sequence: </script
		if (in[i] == '<' && match_ci(i, "</script")) {
		size_t j = i + 8; // strlen("</script") == 8
		while (j < in.size() && in[j] != '>') ++j;
		size_t content_end = i;

		if (i > content_begin) {
		self->_Script.reserve(self->_Script.size() + (i - content_begin));
		std::copy(in.begin() + content_begin, in.begin() + i,
		std::back_inserter(self->_Script));
		// 3a. Extract content preceding the closing tag
		if (content_end > content_begin) {
		// FIX: Use std::vector::insert instead of non-existent append
		self->_Script.insert(self->_Script.end(),
		in.begin() + content_begin,
		in.begin() + content_end);
		}

		if (j < in.size() && in[j] == '>') {
		i = j + 1;
		// 3b. Find the end of the closing tag: </script>
		size_t closing_tag_end_pos = i + keyword_len + 2; // +2 for '</'

		// Skip any characters/whitespace between </script and the final '>'
		while (closing_tag_end_pos < in.size() && in[closing_tag_end_pos] != '>') {
		++closing_tag_end_pos;
		}

		if (closing_tag_end_pos < in.size() && in[closing_tag_end_pos] == '>') {
		i = closing_tag_end_pos + 1; // Position after '>'
		return i;
		}

		// If we found "</script" but not the final ">", return the last processed position.
		return closing_tag_end_pos;
		}
		}

		// --- 4. End of Input Reached ---
		// If the input ends without a closing </script> tag, capture the remaining content.
		if (in.size() > content_begin) {
		self->_Script.reserve(self->_Script.size() + (in.size() - content_begin));
		std::copy(in.begin() + content_begin, in.end(),
		std::back_inserter(self->_Script));
		// FIX: Use std::vector::insert instead of non-existent append
		self->_Script.insert(self->_Script.end(),
		in.begin() + content_begin,
		in.end());
		}

		return i;
		}