Commit d6634591 authored by Camilo Higuita's avatar Camilo Higuita

add gumboparser and more work on the link collector

parent fc0e3d2c
QGumboParser @ af16ad8e
Subproject commit af16ad8e4f627810d986d8daeb5b323e00c14a37
[Dolphin]
Timestamp=2018,7,13,12,53,13
Version=4
[Settings]
HiddenFilesShown=true
#ifndef HTMLTAG_H
#define HTMLTAG_H
#include "gumbo-parser/src/gumbo.h"
// Scoped C++ counterpart of gumbo-parser's tag enumeration.
//
// Only the first enumerator is given an explicit value (GUMBO_TAG_HTML); every
// following enumerator takes the next consecutive value, so the ORDER of the
// list below is significant and must follow the order of the tag enum declared
// in gumbo.h. The static_assert after this enum compares only LAST with
// GUMBO_TAG_LAST, i.e. it detects a changed number of tags but not a
// reordering of them.
enum class HtmlTag
{
// Anchors the numbering to gumbo's own value for the <html> tag.
HTML = GUMBO_TAG_HTML,
HEAD,
TITLE,
BASE,
LINK,
META,
STYLE,
SCRIPT,
NOSCRIPT,
TEMPLATE,
BODY,
ARTICLE,
SECTION,
NAV,
ASIDE,
H1,
H2,
H3,
H4,
H5,
H6,
HGROUP,
HEADER,
FOOTER,
ADDRESS,
P,
HR,
PRE,
BLOCKQUOTE,
OL,
UL,
LI,
DL,
DT,
DD,
FIGURE,
FIGCAPTION,
MAIN,
DIV,
A,
EM,
STRONG,
SMALL,
S,
CITE,
Q,
DFN,
ABBR,
DATA,
TIME,
CODE,
VAR,
SAMP,
KBD,
SUB,
SUP,
I,
B,
U,
MARK,
RUBY,
RT,
RP,
BDI,
BDO,
SPAN,
BR,
WBR,
INS,
DEL,
IMAGE,
IMG,
IFRAME,
EMBED,
OBJECT,
PARAM,
VIDEO,
AUDIO,
SOURCE,
TRACK,
CANVAS,
MAP,
AREA,
MATH,
MI,
MO,
MN,
MS,
MTEXT,
MGLYPH,
MALIGNMARK,
ANNOTATION_XML,
SVG,
FOREIGNOBJECT,
DESC,
TABLE,
CAPTION,
COLGROUP,
COL,
TBODY,
THEAD,
TFOOT,
TR,
TD,
TH,
FORM,
FIELDSET,
LEGEND,
LABEL,
INPUT,
BUTTON,
SELECT,
DATALIST,
OPTGROUP,
OPTION,
TEXTAREA,
KEYGEN,
OUTPUT,
PROGRESS,
METER,
DETAILS,
SUMMARY,
MENU,
MENUITEM,
APPLET,
ACRONYM,
BGSOUND,
DIR,
FRAME,
FRAMESET,
NOFRAMES,
ISINDEX,
LISTING,
XMP,
NEXTID,
NOEMBED,
PLAINTEXT,
RB,
STRIKE,
BASEFONT,
BIG,
BLINK,
CENTER,
FONT,
MARQUEE,
MULTICOL,
NOBR,
SPACER,
TT,
RTC,
// NOTE(review): presumably mirrors GUMBO_TAG_UNKNOWN (a tag gumbo does not
// recognise) — confirm against gumbo.h.
UNKNOWN,
// Final enumerator; its value is the one checked against GUMBO_TAG_LAST.
LAST,
};
// Compile-time guard: the build fails if HtmlTag::LAST and GUMBO_TAG_LAST have
// different numeric values, which happens when the bundled gumbo-parser
// declares a different number of tags than the HtmlTag list. Only the final
// value is compared, so a pure reordering of tags is not detected here.
static_assert(int(HtmlTag::LAST) == int(GUMBO_TAG_LAST),
"HtmlTag should be identical with GUMBO_TAG, "
"may be you use incompatible version of gumbo-parser");
#endif // HTMLTAG_H
The MIT License (MIT)
Copyright (c) <year> <copyright holders>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
# qmake project include for the QGumboParser wrapper together with the bundled
# gumbo-parser sources. All paths are relative to this file's directory ($$PWD).

# Request a static library build with C++11 enabled for the C++ sources.
CONFIG += staticlib c++11
# Extra flag passed to the C compiler: build the C sources as C99.
QMAKE_CFLAGS += -std=c99
# Headers: the Qt-style wrapper first, then the gumbo-parser headers.
# NOTE(review): char_ref.rl is listed here although it has a .rl extension
# rather than .h — confirm it is meant to be under HEADERS.
HEADERS += \
$$PWD/qgumboattribute.h \
$$PWD/qgumbodocument.h \
$$PWD/HtmlTag.h \
$$PWD/qgumbonode.h \
$$PWD/gumbo-parser/src/attribute.h \
$$PWD/gumbo-parser/src/char_ref.h \
$$PWD/gumbo-parser/src/char_ref.rl \
$$PWD/gumbo-parser/src/error.h \
$$PWD/gumbo-parser/src/gumbo.h \
$$PWD/gumbo-parser/src/insertion_mode.h \
$$PWD/gumbo-parser/src/parser.h \
$$PWD/gumbo-parser/src/string_buffer.h \
$$PWD/gumbo-parser/src/string_piece.h \
$$PWD/gumbo-parser/src/tag_enum.h \
$$PWD/gumbo-parser/src/tag_gperf.h \
$$PWD/gumbo-parser/src/tag_sizes.h \
$$PWD/gumbo-parser/src/tag_strings.h \
$$PWD/gumbo-parser/src/token_type.h \
$$PWD/gumbo-parser/src/tokenizer_states.h \
$$PWD/gumbo-parser/src/tokenizer.h \
$$PWD/gumbo-parser/src/utf8.h \
$$PWD/gumbo-parser/src/util.h \
$$PWD/gumbo-parser/src/vector.h
# Sources: the C++ wrapper (.cpp) followed by the gumbo-parser C sources (.c).
SOURCES += \
$$PWD/qgumbodocument.cpp \
$$PWD/qgumbonode.cpp \
$$PWD/qgumboattribute.cpp \
$$PWD/gumbo-parser/src/attribute.c \
$$PWD/gumbo-parser/src/char_ref.c \
$$PWD/gumbo-parser/src/error.c \
$$PWD/gumbo-parser/src/parser.c \
$$PWD/gumbo-parser/src/string_buffer.c \
$$PWD/gumbo-parser/src/string_piece.c \
$$PWD/gumbo-parser/src/tag.c \
$$PWD/gumbo-parser/src/tokenizer.c \
$$PWD/gumbo-parser/src/utf8.c \
$$PWD/gumbo-parser/src/util.c \
$$PWD/gumbo-parser/src/vector.c
# Add this directory to the include search path and to the dependency path.
INCLUDEPATH += \
$$PWD
DEPENDPATH += \
$$PWD
# Extra file shipped with the project but not compiled.
DISTFILES += \
$$PWD/gumbo-parser/src/tag.in
## Introduction
If you need to parse an HTML page in a Qt application, you may run into a problem: Qt doesn't have an HTML parser. You can use [gumbo-parser](https://github.com/google/gumbo-parser), developed by Google, but it is written in pure C and doesn't provide a Qt-like interface, so it is not very comfortable to work with. This small library solves the issue.
## Quick Start
The easiest way to use QGumboParser is to add it to your project as a git submodule.
To add the library use the following steps:
- Create Subdirs Project.
- Add application subproject. "Qt Console Application" for example
- Open project folder and create *libs* directory
- Run `git submodule add git@github.com:lagner/QGumboParser.git libs/QGumboParser` in terminal.
- Run `git submodule update --init --recursive`
- Add `SUBDIRS += libs/QGumboParser/QGumboParser` into the root project. QGumboParser has to appear in your project tree
- Right-click the application project that needs the HTML parser and choose Add Library -> Internal library -> select QGumboParser in the combobox. Click Finish.
The library is ready to use.
Please note that the library requires C++11 support (just add *"CONFIG += c++11"* to your .pro file).
## Example
```cpp
#include <QCoreApplication>
#include <QDebug>
#include <qgumbodocument.h>
#include <qgumbonode.h>
// Sample document parsed by the example below. The raw-string literal uses the
// custom delimiter `~`, i.e. it opens with R"~( and closes with )~", so the
// literal contains only the HTML markup itself and no surrounding quote
// characters that would otherwise end up in the parsed document as text.
const char* HTML_PAGE = R"~(
<!DOCTYPE html>
<html>
<head>
<title>Title text</title>
<meta content="">
<style></style>
</head>
<body>
<h3>First header</h3>
<p>text text text</p>
<div class="content">
<h3>Nested header <a href="">with link</a></h3>
</div>
</body>
</html>
)~";
int main()
{
    // Parse the sample page and take the root of the resulting tree.
    auto document = QGumboDocument::parse(HTML_PAGE);
    auto rootNode = document.rootNode();

    // The sample page is expected to contain exactly one <title> element.
    auto titleNodes = rootNode.getElementsByTagName(HtmlTag::TITLE);
    Q_ASSERT(titleNodes.size() == 1);
    qDebug() << "title is: " << titleNodes.front().innerText();

    // Print the text of every <h3>, including the nested one.
    auto headerNodes = rootNode.getElementsByTagName(HtmlTag::H3);
    for (const auto& header : headerNodes) {
        qDebug() << "h3: " << header.innerText();
    }

    // Look up the single element with class "content" and list the tag names
    // of its direct children.
    auto contentNodes = rootNode.getElementsByClassName("content");
    Q_ASSERT(contentNodes.size() == 1);
    auto contentChildren = contentNodes.front().children();
    for (const auto& child : contentChildren) {
        qDebug() << "Tag: " << child.tagName();
    }

    return 0;
}
```
## License
MIT License. See LICENSE file
\ No newline at end of file
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AlwaysBreakAfterDefinitionReturnType: false
AlwaysBreakTemplateDeclarations: true
AlwaysBreakBeforeMultilineStrings: true
BreakBeforeBinaryOperators: None
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BinPackParameters: true
BinPackArguments: true
ColumnLimit: 80
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
DerivePointerAlignment: true
ExperimentalAutoDetectBinPacking: false
IndentCaseLabels: true
IndentWrappedFunctionNames: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 1
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 120
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
SpacesBeforeTrailingComments: 2
Cpp11BracedListStyle: true
Standard: Auto
IndentWidth: 2
TabWidth: 8
UseTab: Never
BreakBeforeBraces: Attach
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpacesInAngles: false
SpaceInEmptyParentheses: false
SpacesInCStyleCastParentheses: false
SpaceAfterCStyleCast: true
SpacesInContainerLiterals: true
SpaceBeforeAssignmentOperators: true
ContinuationIndentWidth: 4
CommentPragmas: '^ IWYU pragma:'
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
SpaceBeforeParens: ControlStatements
DisableFormat: false
...
[Dolphin]
Timestamp=2018,7,13,12,53,28
Version=4
[Settings]
HiddenFilesShown=true
# Compilation artifacts
*.o
*.lo
*.la
# Editor swap files
*.swp
*.swo
*.swn
#emacs editor leftovers
*.*~
#diff leftovers
*.orig
# gtest pieces
gtest
gtest-1.7.0
# Other build artifacts
/Debug
/visualc/Debug
/visualc/Release
/visualc/gumbo.sdf
/visualc/gumbo.opensdf
/build
.log
.sdf
.opensdf
.deps
.dirstamp
.libs
Makefile
Makefile.in
aclocal.m4
autom4te.cache
compile
config.guess
config.log
config.status
config.sub
configure
depcomp
gumbo.pc
gumbo_test
gumbo_test.log
gumbo_test.trs
install-sh
libtool
ltmain.sh
m4/
missing
test-driver
test-suite.log
# gyp android artifacts
gumbo_parser.target.mk
# `make dist` artifacts
/gumbo-[0-9].[0-9].tar.gz
/gumbo-[0-9].[0-9]/
# Python dist artifacts
*.pyc
*.dylib
dist
build
python/gumbo.egg-info
python/gumbo/libgumbo.so
# Example binaries
benchmark
clean_text
find_links
get_title
positions_of_class
prettyprint
serialize
language: c++
compiler:
- gcc
- clang
os:
- linux
- osx
install:
- wget 'https://googletest.googlecode.com/files/gtest-1.7.0.zip'
- unzip gtest-1.7.0.zip
- ln -s gtest-1.7.0 gtest
- sudo pip install BeautifulSoup
- sudo pip install html5lib==0.95
script:
- ./autogen.sh && ./configure && make && make check
- python python/gumbo/gumboc_test.py
- python python/gumbo/html5lib_adapter_test.py
- python python/gumbo/soup_adapter_test.py
- sudo make install
- g++ examples/clean_text.cc `pkg-config --cflags --libs gumbo`
- sudo python setup.py sdist install
- python -c 'import gumbo; gumbo.parse("Foo")'
Gumbo 0.10.1 (2015-04-30)
Same as 0.10.0, but with the version number bumped because the last version-number commit to v0.9.4 makes GitHub think that v0.9.4 is the latest version and so it's not highlighted on the webpage.
Gumbo 0.10.0 (2015-04-30)
* Full support for <template> tag (kevinhendricks, nostrademons).
* Some fixes for <rtc>/<rt> handling (kevinhendricks, vmg).
* All html5lib-trunk tests pass now! (kevinhendricks, vmg, nostrademons)
* Support for fragment parsing (vmg)
* A couple additional example programs (kevinhendricks)
* Performance improvements totaling an estimated 30-40% total improvement (vmg, nostrademons).
Gumbo 0.9.4 (2015-04-30)
* Additional Visual Studio fixes (lowjoel, nostrademons)
* Fixed some unused variable warnings.
* Fix for glibtoolize vs. libtoolize build errors on Mac.
* Fixed CDATA end tag handling.
Gumbo 0.9.3 (2015-02-17)
* Bugfix for &AElig; entities (rgrove)
* Fix CDATA handling; CDATA sections now generate a GUMBO_NODE_CDATA node rather
than plain text.
* Fix get_title example to handle whitespace nodes (gsnedders)
* Visual Studio compilation fixes (fishioon)
* Take the namespace into account when determining whether a node matches a
certain tag (aroben)
* Replace the varargs tag functions with a tagset bytevector, for a 20-30%
speedup in overall parse time (kevinhendricks, vmg)
* Add MacOS X support to Travis CI, and fix the deployment/DLL issues this
uncovered (nostrademons, kevinhendricks, vmg)
Gumbo 0.9.2 (2014-09-21)
* Performance improvements: Ragel-based char ref decoder and DFA-based UTF8
decoder, totaling speedups of up to 300%.
* Added benchmarking program and some sample data.
* Fixed a compiler error under Visual Studio.
* Fix an error in the ctypes bindings that could lead to memory corruption in
the Python bindings.
* Fix duplicate attributes when parsing <isindex> tags.
* Don't leave semicolons behind when consuming entity references (rgrove)
* Internally rename some functions in preparation for an amalgamation file
(jdeng)
* Add proper cflags for gyp builds (skabbes)
Gumbo 0.9.1 (2014-08-07)
* First version listed on PyPi.
* Autotools files excluded from GitHub and generated via autogen.sh. (endgame)
* Numerous compiler warnings fixed. (bnoordhuis, craigbarnes)
* Google security audit passed.
* Gyp support (tfarina)
* Naming convention for structs changed to avoid C reserved words.
* Fix several integer and buffer overflows (Maxime2)
* Some Visual Studio compiler support (bugparty)
* Python3 compatibility for the ctypes bindings.
Gumbo 0.9.0 (2013-08-13)
* Initial release open-sourced by Google.
Contributing
===========
Bug reports are very much welcome. Please use GitHub's issue-tracking feature, as it makes it easier to keep track of bugs and makes it possible for other project watchers to view the existing issues.
Patches and pull requests are also welcome, but before accepting patches, I need you to sign the Google Contributor License Agreement:
https://developers.google.com/open-source/cla/individual
https://developers.google.com/open-source/cla/corporate
(Electronic signatures are fine for individual contributors.)
If you're unwilling to do this, it would be most helpful if you could file bug reports that include detailed prose about where in the code the error is and how to fix it, but leave out exact source code.
Project priorities
==================
Gumbo's priorities are, in rough order:
1. Conformance to the HTML5 spec
2. Security & stability
3. Compatibility, both with previous versions and with different platforms (Visual Studio, Linux, Mac, other language bindings)
4. API simplicity
5. Performance
6. Features
Patches are much more likely to be accepted if they don't jeopardize values higher in the list for the sake of ones lower in the list. So, we will happily take performance improvements that we can get for free, but not at the expense of complicating the API or reducing conformance. We take patches to improve simplicity, but not at the expense of backwards compatibility. We take new features only if they don't jeopardize any of the other traits (and are often quite conservative with them because of the backwards-compatibility and simplicity guarantees).
If you have a need for additional features beyond Gumbo's basic API, one option is to wrap Gumbo with another library, translating its data structures into ones more appropriate for your own use-case and then throwing away the original parse tree. Gumbo was built for this; it's why most of the data structures are simple structs and the parse tree is intended to be immutable. Tree traversal overhead is measured as negligible (~2-3%) compared to parsing time. See e.g. gumbo-libxml or gumbo-query for examples.
Code hygiene
============
We accept bugfixes even without this, but here are things you can do to make our lives as maintainers easier:
1. Write unit tests. When you discover a bug, write a test case that exposes the bug first, and then fix the bug. We use both Travis CI (Mac/Linux) and AppVeyor (Visual Studio) for continuous integration, and both automatically run against pull requests; this makes it much easier to verify that the patch is correct.
2. Break big changes up into smaller, atomic pull requests, each of which fixes one bug or adds one feature. In particular, separate out performance improvements from bug/correctness fixes from new features, and separate out pull requests that need to go into different branches.
3. Work off the correct branch, and submit the pull request against it. We follow Semantic Versioning, which means that we bump the patch number for internal-only bugfixes and performance improvements, the minor number for backwards-compatible API changes, and the major number for backwards-incompatible API changes. There are branches for all of these.
4. For performance improvements, run benchmarks before and after, and include the numbers in the commit message. There is a benchmarking program at ./benchmark that runs the parser against some common webpages.
5. Run clang-format after making your changes to make sure your code conforms to the existing style. There is a .clang-format file already in the codebase; run it with 'clang-format -i src/*.{h,c}'.
This diff is collapsed.
These are a couple of debugging notes that may be helpful for anyone developing
Gumbo or trying to diagnose a tricky problem. They will probably not be
necessary for normal clients of this library - Gumbo is relatively stable, and
bugs are often rare and obscure. However, they're handy to have as a reference,
and may also provide useful Google fodder to people searching for these tools.
Standard disclaimer: I use all of these techniques on my Ubuntu 14.04 computer
with gcc 4.8.2, clang 3.4, and gtest 1.6.0, but make no warranty about them
working on other systems. In particular, they're almost certain not to work on
Windows.
Debug output
============
Gumbo has a compile-time switch to dump lots of debug output onto stdout.
Compile with the GUMBO_DEBUG define enabled:
```bash
$ make CFLAGS='-DGUMBO_DEBUG'
```
Note that this spits *a lot* of debug information to the console and makes the
program run significantly slower, so it's usually helpful to isolate only the
specific HTML file or fragment that causes the bug. It lets us trace the
operation of each of the tokenizer & parser's state machines in depth, though.
Unit tests
==========
As mentioned in the README, Gumbo relies on [googletest][] for unit tests.
Unzip the gtest ZIP distribution inside the Gumbo root and rename it 'gtest'.
'make check' runs the tests, as normal.
```bash
$ make check
$ cat test-suite.log
```
If you need to debug a core dump, you'll probably want to run the test binary
directly:
```bash
$ ulimit -c unlimited
$ make check
$ .libs/lt-gumbo_test
$ gdb .libs/lt-gumbo_test core
```
The same goes for core dumps in other example binaries.
To run only a single unit test, pass the --gtest_filter='TestName' flag to the
lt-gumbo_test binary.