Skip to content

Commit

Permalink
Merge pull request #41 from teeshop/feature/utf8
Browse files Browse the repository at this point in the history
Feature/utf8
  • Loading branch information
teeshop authored Jun 7, 2017
2 parents dc9ba1e + 93588b8 commit 4ab1284
Show file tree
Hide file tree
Showing 34 changed files with 541 additions and 640 deletions.
5 changes: 4 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
dist: trusty
sudo: false

language: cpp

compiler:
Expand All @@ -16,7 +19,7 @@ addons:
- python-pip

before_install:
- sudo pip install cpplint
- pip install cpplint

before_script: mkdir -p build
script:
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ set(librexgen_lua "lib${prj}-lua")

message("creating ${prj} ${librexgen_version}")

ADD_DEFINITIONS("-funsigned-char")

if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
MESSAGE(STATUS "USING DEBUG MODE")
ADD_DEFINITIONS("-DREXGEN_DEBUG=0" "-DDEBUG" "-O0" "-g")
Expand Down
21 changes: 10 additions & 11 deletions src/librexgen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ ELSE()
ADD_DEFINITIONS("-std=c++11 -Wall -Wextra -Wshadow -Wpointer-arith -Wcast-qual -Winline")
ENDIF(MSVC)

ADD_DEFINITIONS("-DUNICODE")

FLEX_TARGET(regexscanner parser/regex_lexer.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp COMPILE_FLAGS "")
BISON_TARGET(regexparser parser/regex_parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp COMPILE_FLAGS "")

Expand Down Expand Up @@ -50,20 +52,15 @@ set(librexgen_sources
iterator/regexalternativesiterator.cpp
iterator/compoundregexiterator.cpp
iterator/groupreferenceiterator.cpp
iterator/classregexiterator.cpp
iterator/streamregexiterator.cpp
iterator/caseiterator.cpp
iterator/terminalregexiterator.cpp
state/serializablestate.cpp
string/simplestring.cpp

parser/rexgenparsercontext.cpp

librexgen.cpp

c/librexgen.cpp
c/iterator.cpp
c/simplestring.cpp
string/simplestring.cpp
parser/rexgenparsercontext.cpp
librexgen.cpp
c/librexgen.cpp
c/iterator.cpp
c/simplestring.cpp
)
set(parser_sources
${BISON_regexparser_OUTPUTS} )
Expand Down Expand Up @@ -109,6 +106,7 @@ SET(INSTALL_INCLUDE_DIR "include/${librexgen}")
file(GLOB librexgen_headers "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
file(GLOB regex_headers "${CMAKE_CURRENT_SOURCE_DIR}/regex/*.h")
file(GLOB iterator_headers "${CMAKE_CURRENT_SOURCE_DIR}/iterator/*.h")
file(GLOB common_headers "${CMAKE_CURRENT_SOURCE_DIR}/common/*.h")
file(GLOB parser_headers "${CMAKE_CURRENT_SOURCE_DIR}/parser/*.h")
file(GLOB string_headers "${CMAKE_CURRENT_SOURCE_DIR}/string/*.h")
file(GLOB state_headers "${CMAKE_CURRENT_SOURCE_DIR}/state/*.h")
Expand All @@ -117,6 +115,7 @@ file(GLOB generated_headers "${CMAKE_CURRENT_BINARY_DIR}/librexgen/*.h")
install(FILES ${librexgen_headers} DESTINATION ${INSTALL_INCLUDE_DIR} )
install(FILES ${regex_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/regex )
install(FILES ${iterator_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/iterator )
install(FILES ${common_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/common )
install(FILES ${parser_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/parser )
install(FILES ${string_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/string )
install(FILES ${state_headers} DESTINATION ${INSTALL_INCLUDE_DIR}/state )
Expand Down
49 changes: 36 additions & 13 deletions src/librexgen/c/simplestring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

#include <librexgen/c/simplestring.h>
#include <librexgen/string/simplestring.h>
#include <cstdlib>
#include <clocale>

#ifdef __cplusplus
extern "C" {
Expand All @@ -33,24 +35,45 @@ void c_simplestring_delete(c_simplestring_ptr s) {
}

EXPORT
int c_simplestring_to_utf8_string(c_simplestring_ptr s, char* buffer,
size_t buffer_size) {
return (static_cast<SimpleString*>(s))->to_utf8_string(buffer,
buffer_size);
/*
 * C-API accessor: treats the opaque handle `s` as a SimpleString and hands
 * back whatever SimpleString::c_str() returns for it.
 * NOTE(review): the returned pointer is presumably owned by the SimpleString
 * and only valid while that object is alive and unmodified -- confirm against
 * SimpleString::c_str().
 */
const char* c_simplestring_to_string(c_simplestring_ptr s) {
  SimpleString* simple_string = static_cast<SimpleString*>(s);
  return simple_string->c_str();
}

EXPORT
int c_simplestring_to_ansi_string(c_simplestring_ptr s, char* buffer,
/*
 * Copies the bytes of the SimpleString behind `s` into `buffer`, one
 * multibyte character at a time (mblen -> mbtowc -> wctomb), and returns the
 * number of bytes written. The copy loop itself never appends a terminating
 * NUL.
 *
 * s            opaque handle, cast to SimpleString*
 * buffer       destination byte buffer
 * buffer_size  capacity of `buffer` in bytes
 *
 * Throws std::runtime_error when mblen() or wctomb() reports a conversion
 * error.
 *
 * NOTE(review): this span looks like a rendered diff hunk -- the lines that
 * mention to_ansi_string / to_external_string appear to be residue of wrappers
 * removed by the same commit, not part of this function. Verify against the
 * real file before acting on it.
 * NOTE(review): std::runtime_error and memcpy need <stdexcept> and <cstring>;
 * only <cstdlib> and <clocale> are visibly added here -- confirm they arrive
 * through another header.
 */
int c_simplestring_to_utf8_string(c_simplestring_ptr s, char* buffer,
size_t buffer_size) {
return (static_cast<SimpleString*>(s))->to_ansi_string(buffer,
buffer_size);
}
const SimpleString* str = static_cast<SimpleString*>(s);
char tmp_buffer[8];
wchar_t wc;
int result = 0;
/* Remember the active locale so it can be restored after the conversion.
 * NOTE(review): the string returned by setlocale() may be overwritten by the
 * next setlocale() call, so this pointer should probably be copied before the
 * locale is switched -- confirm against the C library documentation. */
char* current_locale = std::setlocale(LC_ALL, NULL);
/* NOTE(review): the result is not checked; if "en_US.UTF-8" is not installed
 * the locale presumably stays unchanged -- verify. */
std::setlocale(LC_ALL, "en_US.UTF-8");

EXPORT
int c_simplestring_to_external_string(c_simplestring_ptr s, char* buffer,
size_t buffer_size) {
return (static_cast<SimpleString*>(s))->to_external_string(buffer,
buffer_size);
/* Walk the raw bytes of the string from data() to data() + size(). */
const char* ptr = str->data();
const char* end = ptr + str->size();
while (ptr < end) {
/* Byte length of the next multibyte character under the current locale. */
int next = std::mblen(ptr, end-ptr);
if (next == -1) {
/* NOTE(review): throwing here skips the locale restore below, and this
 * function appears to sit inside the extern "C" block -- confirm that callers
 * can cope with a C++ exception. */
throw std::runtime_error("mblen(): conversion error");
}

/* Decode to a wide character, then encode it again into tmp_buffer.
 * NOTE(review): the return value of mbtowc() is not checked. */
std::mbtowc(&wc, ptr, end-ptr);
const int size = std::wctomb(&tmp_buffer[0], wc);
if (size < 1) {
throw std::runtime_error("wctomb(): conversion error");
}
/* Append only while at least one byte stays free after this character. */
if (size < (static_cast<int>(buffer_size)-result)) {
memcpy(buffer+result, tmp_buffer, size);
result += size;
} else {
/* Out of space: the FIRST byte of buffer is zeroed and the loop stops, but
 * `result` keeps the count written so far.
 * NOTE(review): the returned length and the buffer contents disagree on this
 * path -- confirm this is intended. */
*buffer = 0;
break;
}
/* NOTE(review): mblen() can return 0 for a NUL byte, which would leave ptr
 * unchanged -- confirm embedded NULs cannot occur here. */
ptr += next;
}
/* Put the previously active locale back (see the note where it is saved). */
std::setlocale(LC_ALL, current_locale);

return result;
}

EXPORT
Expand Down
11 changes: 4 additions & 7 deletions src/librexgen/c/simplestring.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <librexgen/string/unicode.h>
#include <librexgen/osdepend.h>
#include <librexgen/common/deprecated.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
Expand All @@ -40,15 +41,11 @@ EXPORT
void c_simplestring_delete(c_simplestring_ptr s);

EXPORT
int c_simplestring_to_utf8_string(c_simplestring_ptr s, char* buffer,
size_t buffer_size);
EXPORT
int c_simplestring_to_ansi_string(c_simplestring_ptr s, char* buffer,
size_t buffer_size);
const char* c_simplestring_to_string(c_simplestring_ptr s);

EXPORT
int c_simplestring_to_external_string(c_simplestring_ptr s, char* buffer,
size_t buffer_size);
DEPRECATED(int c_simplestring_to_utf8_string(c_simplestring_ptr s, char* buffer,
size_t buffer_size));

EXPORT
void c_simplestring_clear(c_simplestring_ptr s);
Expand Down
3 changes: 2 additions & 1 deletion src/librexgen/c/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
* by ANSI-C code
*/
#include <stddef.h>
#include <stdint.h>

typedef void* c_regex_ptr;
typedef size_t (*callback_fp)(wchar_t* dst, const size_t buffer_size);
typedef size_t (*callback_fp)(char* dst, const size_t buffer_size);

#endif /* SRC_LIBREXGEN_C_TYPES_H_ */
1 change: 0 additions & 1 deletion src/librexgen/cmake_install.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unsp
IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/librexgen/unicode" TYPE FILE FILES
"/home/jan/projects/rexgen/trunk/src/librexgen/unicode/simple_string.h"
"/home/jan/projects/rexgen/trunk/src/librexgen/unicode/uchar.h"
"/home/jan/projects/rexgen/trunk/src/librexgen/unicode/utf32.h"
)
ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
Expand Down
33 changes: 33 additions & 0 deletions src/librexgen/common/deprecated.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
rexgen - a tool to create words based on regular expressions
Copyright (C) 2012-2017 Jan Starke <[email protected]>
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110, USA
*/


#ifndef SRC_LIBREXGEN_COMMON_DEPRECATED_H_
#define SRC_LIBREXGEN_COMMON_DEPRECATED_H_

/*
 * DEPRECATED(decl) wraps a declaration with the compiler-specific
 * "deprecated" marker:
 *   - compilers defining __GNUC__: trailing __attribute__ ((deprecated))
 *   - compilers defining _MSC_VER: leading __declspec(deprecated)
 *   - anything else: the declaration is passed through unchanged and a
 *     build-time message is emitted.
 */
#if defined(__GNUC__)
#  define DEPRECATED(func) func __attribute__ ((deprecated))
#elif defined(_MSC_VER)
#  define DEPRECATED(func) __declspec(deprecated) func
#else
#  pragma message("WARNING: You need to implement DEPRECATED for this compiler")
#  define DEPRECATED(func) func
#endif

#endif /* SRC_LIBREXGEN_COMMON_DEPRECATED_H_ */
3 changes: 3 additions & 0 deletions src/librexgen/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ typedef uint8_t charset;
#define CHARSET_UTF16LE 5
#define CHARSET_UTF32LE 6

typedef char byte_t;
typedef uint32_t codepoint_t;

typedef uint8_t uchar_flags_t;
typedef uint8_t uchar_info_t;
typedef wchar_t uchar_codepoint_t;
Expand Down
19 changes: 8 additions & 11 deletions src/librexgen/iterator/caseiterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
#include <librexgen/iterator/caseiterator.h>
#include <librexgen/parser/group_options.h>
#include <librexgen/common/ntz.h>
#include <librexgen/string/uchar.h>
#include <librexgen/genericerror.h>
#include <cassert>

CaseIterator::CaseIterator(Iterator* __child, int options)
: IteratorContainer(-1), child(__child), handle_case(options) {
Expand All @@ -46,11 +46,13 @@ bool CaseIterator::readNextFromChild() {
childHadNext = child->next();
child->value(&word);

for (unsigned int n=0; n < word.size(); ++n) {
if (word.can_change_case(n)) {
word.tolower(n);
changeable_characters.push_back(n);
/* store the indices of all convertible characters */
for (unsigned int idx=0; idx < word.size();) {
if (word.can_change_case(idx)) {
changeable_characters.push_back(idx);
}

idx += word.character_length(idx);
}

if (changeable_characters.size() <= max_fast_character_bytes) {
Expand All @@ -62,11 +64,6 @@ bool CaseIterator::readNextFromChild() {
parity = 0;
j = 0; /* == ntz(k) & parity */

/* delete UCHAR_FLAGS_CAN_CHANGE_CASE for all characters */
if (handle_case == CASE_PRESERVE) {
word.set_preserve_case();
}

return childHadNext;
}

Expand All @@ -81,7 +78,7 @@ bool CaseIterator::hasNext() const {
* of Donald Ervin Knuth, found in TAOCP, 7.2.1.1 */
bool CaseIterator::next() {
/* G1 */
if (word.empty() || k == 0) {
if (word.empty() || k == 0 || handle_case == CASE_IGNORE) {
bool childHadNext = readNextFromChild();

/* keep in mind: k is the number of remaining variants */
Expand Down
5 changes: 2 additions & 3 deletions src/librexgen/iterator/caseiterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#define SRC_LIBREXGEN_ITERATOR_CASEITERATOR_H_

#include <librexgen/iterator/iteratorcontainer.h>
#include <librexgen/string/uchar.h>
#include <librexgen/string/simplestring.h>
#include <cinttypes>
#include <vector>
Expand Down Expand Up @@ -56,7 +55,6 @@ class CaseIterator : public IteratorContainer {
Iterator* child;
int handle_case;
bool readNextFromChild();
SimpleString word;

#if __x86_64__
typedef std::uint64_t counter_t;
Expand All @@ -69,7 +67,8 @@ class CaseIterator : public IteratorContainer {
counter_t k;
unsigned int j;
unsigned int parity;
std::vector<int> changeable_characters;
std::vector<unsigned int> changeable_characters;
SimpleString word;
};

#endif /* __cplusplus */
Expand Down
51 changes: 0 additions & 51 deletions src/librexgen/iterator/classregexiterator.cpp

This file was deleted.

Loading

0 comments on commit 4ab1284

Please sign in to comment.