/***********************************************************************************

    Copyright (C) 2007-2020 Ahmet Öztürk (aoz_2@yahoo.com)

    This file is part of Lifeograph.

    Lifeograph is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Lifeograph is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Lifeograph.  If not, see <http://www.gnu.org/licenses/>.

***********************************************************************************/


#include "parser_text.hpp"

#include "diarydata.hpp"


using namespace LIFEO;

// TEXT FORMATTING
// Recipe tables for in-text formatting.
// Every table is an ordered list of steps; a step is either a bare set of character flags or a
// { flags, applier } pair. Recipe::process_char() walks a table one input character at a time:
// when the current character satisfies the step at the recipe's index the optional applier is
// invoked and the index advances. A step carrying CM_MULTIPLE has to match once to advance and
// may then keep matching further characters for as long as the following step does not match.
// The Ch_*/CF_* flags of each input character are assigned in ParserText::parse().
const ParserText::Recipe::Contents
    // newline, space, then any non-newline char; junction_subheading() picks the heading level
    // depending on whether that last char is a second space
    ParserText::m_rc_subheading =
    { Ch_NEWLINE,
      { Ch_SPACE, &ParserText::set_start },
      { CFC_ANY_BUT_NEWLINE, &ParserText::junction_subheading } },

    // a markup char (* _ = #) preceded by a blank or parenthesis; junction_markup() then swaps
    // the recipe's contents for the matching m_rc_markup_?_end table below
    ParserText::m_rc_markup =
    { CFC_BLANK|CF_PARENTHESIS,
      { CF_MARKUP, &ParserText::junction_markup } },

    // bold body: non-newline chars closed by '*'
    ParserText::m_rc_markup_b_end =
    { { CFC_ANY_BUT_NEWLINE|CM_MULTIPLE, &ParserText::junction_markup2 },
      { Ch_ASTERISK, &ParserText::apply_bold } },

    // italic body: non-newline chars closed by '_'
    ParserText::m_rc_markup_i_end =
    { { CFC_ANY_BUT_NEWLINE|CM_MULTIPLE, &ParserText::junction_markup2 },
      { Ch_UNDERSCORE, &ParserText::apply_italic } },

    // highlight body: non-newline chars closed by '#'
    ParserText::m_rc_markup_h_end =
    { { CFC_ANY_BUT_NEWLINE|CM_MULTIPLE, &ParserText::junction_markup2 },
      { Ch_HASH, &ParserText::apply_highlight } },

    // strikethrough body: non-newline chars closed by '='
    ParserText::m_rc_markup_s_end =
    { { CFC_ANY_BUT_NEWLINE|CM_MULTIPLE, &ParserText::junction_markup2 },
      { Ch_EQUALS, &ParserText::apply_strikethrough } },

    // "[[" + non-newline chars + "]]"
    ParserText::m_rc_comment =
    { { Ch_SBB, &ParserText::set_start },
      Ch_SBB,
      CFC_ANY_BUT_NEWLINE|CM_MULTIPLE,
      Ch_SBE,
      { Ch_SBE, &ParserText::apply_comment } },

    // a line starting with '.' and a tab; add_block() raises RS_BLOCK while the rest of the line
    // is consumed so that the recipes listed in this recipe's m_blocks get rejected meanwhile
    ParserText::m_rc_ignore =
    { Ch_NEWLINE,
      { Ch_DOT, &ParserText::set_start },
      Ch_TAB,
      { CFC_ANY_BUT_NEWLINE|CM_MULTIPLE, &ParserText::add_block },
      { Ch_NEWLINE, &ParserText::apply_ignore } },

    // newline, tab(s), '[' + status char + ']' + space; junction_todo() inspects the status char
    ParserText::m_rc_todo =
    { Ch_NEWLINE,
      Ch_TAB|CM_MULTIPLE,
      { Ch_SBB, &ParserText::set_start },
      CF_TODO_STATUS,
      Ch_SBE,
      { Ch_SPACE, &ParserText::junction_todo } },

    // newline, tab(s), then a CFC_NONSPACE char
    ParserText::m_rc_indent =
    { { Ch_NEWLINE, &ParserText::set_start },
      Ch_TAB|CM_MULTIPLE,
      { CFC_NONSPACE, &ParserText::apply_indent } },

    // ":name:" preceded by a blank or parenthesis, optionally followed by "=" and one or two
    // numeric runs separated by a value separator ('/'); junction_tag() and junction_tag2() jump
    // to the last step when the optional part is absent
    ParserText::m_rc_tag =
    { CFC_BLANK|CF_PARENTHESIS,
      { Ch_COLON, &ParserText::set_start },
      CFC_ANY_BUT_NEWLINE|CM_MULTIPLE,
      Ch_COLON,
      { CFC_ANY, &ParserText::junction_tag }, // equal sign
      { CF_NUMERIC|CM_MULTIPLE, &ParserText::junction_number },
      { CFC_ANY_BUT_NUMERIC|CF_VALUE_SEPARATOR, &ParserText::junction_tag2 }, // slash
      { CF_NUMERIC|CM_MULTIPLE, &ParserText::junction_number },
      { CFC_ANY_BUT_NUMERIC, &ParserText::apply_inline_tag }
    };

// LINK
// Recipe tables for links. The tables from m_rc_link_file to m_rc_chart describe only what
// follows the "protocol:" prefix; they are attached to protocol names in the constructor and
// started by junction_colon(). See Recipe::process_char() for how a table is evaluated.
const ParserText::Recipe::Contents
    // 4 digits, separator, 2 digits, separator, 2 digits, preceded by a blank or punctuation;
    // the junctions validate year and month, and check_date() validates the complete date
    ParserText::m_rc_date =
    { CFC_BLANK|CF_PUNCTUATION,
      { CF_DIGIT, &ParserText::set_start },
      CF_DIGIT,
      CF_DIGIT,
      CF_DIGIT,
      { CF_DATE_SEPARATOR, &ParserText::junction_date_dotym },
      CF_DIGIT,
      CF_DIGIT,
      { CF_DATE_SEPARATOR, &ParserText::junction_date_dotmd },
      CF_DIGIT,
      { CF_DIGIT, &ParserText::check_date } },

    // any ':' triggers the protocol lookup in junction_colon()
    ParserText::m_rc_colon =
    { { Ch_COLON, &ParserText::junction_colon } },

    // bare e-mail address, detected from the '@' onwards; junction_at() moves the start back to
    // just after the last blank
    ParserText::m_rc_at_email =
    { { Ch_AT, &ParserText::junction_at },
      CFC_NONSPACE|CM_MULTIPLE,
      Ch_DOT,
      CFC_NONSPACE|CM_MULTIPLE,
      { CFC_BLANK, &ParserText::apply_link } },

    // "//" + target + blank
    // NOTE(review): CF_SPACE_CONDTNL presumably lets spaces match when the recipe's
    // m_flag_accept_spaces is set -- confirm in cmp_chars()/get_char_class_at()
    ParserText::m_rc_link_file =
    { Ch_SLASH,
      Ch_SLASH,
      CF_SPACE_CONDTNL|CFC_NONSPACE|CM_MULTIPLE,
      { CF_SPACE_CONDTNL|CFC_BLANK, &ParserText::junction_link } },

    // something '@' something '.' something + blank
    ParserText::m_rc_link_email =
    { CFC_NONSPACE|CM_MULTIPLE,
      Ch_AT,
      CFC_NONSPACE|CM_MULTIPLE,
      Ch_DOT,
      CFC_NONSPACE|CM_MULTIPLE,
      { CFC_BLANK, &ParserText::junction_link } },

    // a run of CFC_NONSPACE chars + blank
    ParserText::m_rc_link_geo =
    { CFC_NONSPACE|CM_MULTIPLE,
      { CFC_BLANK, &ParserText::junction_link } },

    // at least four digits + blank; junction_link() stores the number in m_int_value
    ParserText::m_rc_link_id =
    { CF_DIGIT,
      CF_DIGIT,
      CF_DIGIT,
      CF_DIGIT|CM_MULTIPLE,
      { CFC_BLANK, &ParserText::junction_link } },

    // the rest of the line after the protocol prefix
    ParserText::m_rc_chart =
    { CFC_ANY_BUT_NEWLINE|CM_MULTIPLE,
      { Ch_NEWLINE, &ParserText::apply_chart } },

    // tail of a hidden link: tab + label + '>'; this recipe is started by junction_link() when
    // the link recipe has m_flag_accept_spaces set
    ParserText::m_rc_link_hidden_end =
    { { Ch_TAB, &ParserText::set_middle },
      CFC_ANY_BUT_NEWLINE|CM_MULTIPLE,
      { Ch_MORE, &ParserText::apply_link_hidden } };

// TEXT PARSER =====================================================================================
// Registers the prototype recipes that are tried against every character, and the link
// protocols that junction_colon() looks up by the word preceding a ':'.
// Both the recipes and the protocols are heap-allocated here and released in the destructor.
ParserText::ParserText()
{
    // Recipe arguments: id, parent parser, contents table and the value stored in m_blocks,
    // i.e. the ids that get blocked while this recipe runs with RS_BLOCK set
    m_all_recipes.insert( new Recipe{ RID_SUBHEADING, this, &m_rc_subheading, 0 } );
    m_all_recipes.insert( new Recipe{ RID_MARKUP, this, &m_rc_markup, 0 } );
    m_all_recipes.insert( new Recipe{ RID_COMMENT, this, &m_rc_comment, 0 } );
    m_all_recipes.insert( new Recipe{ RID_IGNORE, this, &m_rc_ignore, RID_ALL } );
    m_all_recipes.insert( new Recipe{ RID_TODO, this, &m_rc_todo, 0 } );
    m_all_recipes.insert( new Recipe{ RID_DATE, this, &m_rc_date, 0 } );
    m_all_recipes.insert( new Recipe{ RID_COLON, this, &m_rc_colon, 0 } );
    m_all_recipes.insert( new Recipe{ RID_LINK_AT, this, &m_rc_at_email, 0 } );
    m_all_recipes.insert( new Recipe{ RID_GENERIC, this, &m_rc_indent, 0 } );
    m_all_recipes.insert( new Recipe{ RID_GENERIC, this, &m_rc_tag, 0 } );

    // protocol name (the word before the colon) -> recipe id + table for the part after it
    m_link_protocols.emplace( "deid", new LinkProtocol( RID_ID, &m_rc_link_id ) );
    m_link_protocols.emplace( "file", new LinkProtocol( RID_URI, &m_rc_link_file ) );
    m_link_protocols.emplace( "ftp", new LinkProtocol( RID_URI, &m_rc_link_file ) );
    m_link_protocols.emplace( "geo", new LinkProtocol( RID_URI, &m_rc_link_geo ) );
    m_link_protocols.emplace( "http", new LinkProtocol( RID_URI, &m_rc_link_file ) );
    m_link_protocols.emplace( "https", new LinkProtocol( RID_URI, &m_rc_link_file ) );
    m_link_protocols.emplace( "mailto", new LinkProtocol( RID_URI, &m_rc_link_email ) );
    m_link_protocols.emplace( "rel", new LinkProtocol( RID_URI, &m_rc_link_file ) );
    m_link_protocols.emplace( "chart", new LinkProtocol( RID_CHART, &m_rc_chart ) );
}
// Releases everything the parser allocated: the prototype recipes, the link protocols and the
// recipe copies that were still in progress when the last parse finished.
ParserText::~ParserText()
{
    // every parse leaves in-progress copies behind (e.g. recipes that were started by the
    // closing newline); they are otherwise only freed by the next reset()
    for( Recipe* active : m_active_recipes )
        delete active;
    m_active_recipes.clear();

    for( Recipe* prototype : m_all_recipes )
        delete prototype;

    for( auto& kv_protocol : m_link_protocols )
        delete kv_protocol.second;
}

// Prepares the parser for a new run over the range [bgn, end): rewinds all positions to bgn,
// clears the per-run state and re-primes the recipes.
void
ParserText::reset( UstringSize bgn, UstringSize end )
{
    m_pos_end = end;

    // every running position starts at the beginning of the region
    m_pos_extra_2 = bgn;
    m_pos_extra_1 = bgn;
    m_pos_para_bgn = bgn;
    m_pos_blank = bgn;
    m_pos_cur = bgn;

    m_cf_curr = CF_NOTHING;
    m_cf_last = Ch_NOT_SET;

    m_word_cur.clear();
    m_word_count = 0;

    m_int_last = 0;
    m_date_last = 0;

    // discard the recipe copies that the previous run left in progress
    for( Recipe* leftover : m_active_recipes )
        delete leftover;
    m_active_recipes.clear();

    // start as if previous char is a new line
    for( Recipe* prototype : m_all_recipes )
    {
        prototype->m_index = 0;
        prototype->m_state = Recipe::RS_NOT_SET;

        const bool started{ prototype->process_char() == Recipe::RS_IN_PROGRESS };
        if( started )
            m_active_recipes.push_back( new Recipe( prototype ) );
    }
}

// Stores the text to be searched for during parse() and rewinds the match cursor.
// parse() compares it against lower-cased input characters and skips searching altogether
// when it is empty.
void
ParserText::set_search_str( const Ustring& str )
{
    i_search = 0;                   // no character matched yet
    i_search_end = str.size() - 1;  // index of the last character to match
    m_search_str = str;
}

void
ParserText::parse( const UstringSize bgn, const UstringSize end )
{
    reset( bgn, end );

    if( bgn == 0 )
        apply_heading();

    if( bgn == end ) // zero length
        return;

    for( ; m_pos_cur < m_pos_end; ++m_pos_cur )
    {
        m_char_last = m_char_cur;
        m_char_cur = get_char_at( m_pos_cur );

        if( !m_search_str.empty() )
        {
            if( m_search_str[ i_search ] == char_lower( m_char_cur ) )
            {
                if( i_search == 0 )
                    m_pos_search = m_pos_cur;
                if( i_search == i_search_end )
                {
                    apply_match();
                    i_search = 0;
                }
                else
                    i_search++;
            }
            else
                i_search = 0;
        }

        // MARKUP PARSING
        switch( m_char_cur )
        {
            case 0:     // should never be the case
            case '\n':
            case '\r':
                m_cf_curr = Ch_NEWLINE|CF_NEWLINE;
                process_char();
                if( m_pos_cur > bgn ) // skip the \n at the start of the parsing region
                    process_paragraph();
                m_pos_para_bgn = m_pos_cur + 1;
                continue;   // !!!!! CONTINUES TO SKIP process_char() BELOW !!!!!
            case ' ':
                m_cf_curr = Ch_SPACE|CF_SPACE|CF_TODO_STATUS|CF_NUMERIC;
                break;
            case '*': // SIGN
                m_cf_curr = Ch_ASTERISK|CF_PUNCTUATION|CF_MARKUP|CF_NONNUMERIC_PUNCT;
                break;
            case '_': // SIGN
                m_cf_curr = Ch_UNDERSCORE|CF_PUNCTUATION|CF_MARKUP|CF_NONNUMERIC_PUNCT;
                break;
            case '=': // SIGN
                m_cf_curr = Ch_EQUALS|CF_PUNCTUATION|CF_MARKUP|CF_NONNUMERIC_PUNCT;
                break;
            case '#': // SIGN
                m_cf_curr = Ch_HASH|CF_PUNCTUATION|CF_MARKUP|CF_NONNUMERIC_PUNCT;
                break;
            case '[': // SIGN
                m_cf_curr = Ch_SBB|CF_PUNCTUATION|CF_NONNUMERIC_PUNCT|CF_PARENTHESIS;
                break;
            case ']': // SIGN
                m_cf_curr = Ch_SBE|CF_PUNCTUATION|CF_NONNUMERIC_PUNCT|CF_PARENTHESIS;
                break;
            case '(': case ')':
            case '{': case '}': // parentheses
                m_cf_curr = CF_PUNCTUATION|CF_NONNUMERIC_PUNCT|CF_PARENTHESIS;
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                m_cf_curr = CF_DIGIT|CF_NUMERIC;
                process_number();   // calculates numeric value
                break;
            case '.': // SIGN
                m_cf_curr = Ch_DOT|CF_PUNCTUATION|CF_DATE_SEPARATOR|CF_NUMERIC;
                break;
            case ',': // SIGN
                m_cf_curr = Ch_COMMA|CF_PUNCTUATION|CF_NUMERIC;
                break;
            case '-': // SIGN - CF_SIGNSPELL does not seem to be necessary
                m_cf_curr = Ch_DASH|CF_PUNCTUATION|CF_DATE_SEPARATOR|CF_NUMERIC;
                break;
            case '/': // SIGN
                m_cf_curr = Ch_SLASH|CF_PUNCTUATION|CF_DATE_SEPARATOR|
                                     CF_VALUE_SEPARATOR|CF_NONNUMERIC_PUNCT;
                break;
            case ':': // SIGN
                m_cf_curr = Ch_COLON|CF_PUNCTUATION|CF_NONNUMERIC_PUNCT;
                break;
            case '@': // SIGN
                m_cf_curr = Ch_AT|CF_PUNCTUATION|CF_NONNUMERIC_PUNCT;
                break;
            case '<': // SIGN
                m_cf_curr = Ch_LESS|CF_PUNCTUATION|CF_NONNUMERIC_PUNCT;
                break;
            case '>': // SIGN
                m_cf_curr = Ch_MORE|CF_PUNCTUATION|CF_TODO_STATUS|CF_NONNUMERIC_PUNCT;
                break;
            case '\t':
                m_cf_curr = Ch_TAB|CF_TAB;
                break;
            // LIST CHARS
            case '~':
                m_cf_curr = Ch_TILDE|CF_PUNCTUATION|CF_TODO_STATUS|CF_NONNUMERIC_PUNCT;
                break;
            case '+':
                m_cf_curr = Ch_PLUS|CF_PUNCTUATION|CF_TODO_STATUS|CF_NUMERIC;
                break;
            case 'x':
            case 'X':
                m_cf_curr = Ch_X|CF_ALPHA|CF_SPELLCHECK|CF_TODO_STATUS|CF_NONNUMERIC_PUNCT;
                break;
            case '\'':
                m_cf_curr = CF_PUNCTUATION|CF_SPELLCHECK|CF_NONNUMERIC_PUNCT;
                break;
            default:
                m_cf_curr = is_char_alpha( m_char_cur ) ? CF_ALPHA|CF_SPELLCHECK :
                                                          CF_PUNCTUATION|CF_NONNUMERIC_PUNCT;
                break;
        }
        process_char();
    }
    // end of the text -treated like a new line for all means and purposes
    if( m_pos_end > 0 ) // only when finish is not forced
    {
        m_char_last = m_char_cur;
        m_char_cur = '\n';
        m_cf_curr = Ch_NEWLINE|CF_NEWLINE;
        process_char();
        process_paragraph();
    }
}

// Advances this recipe by the parser's current character (m_parent->m_cf_curr) and returns the
// resulting state:
// - rejected right away when the parent currently blocks this recipe's id,
// - moved one step forward (running the step's applier, if any) when the character satisfies
//   the step at m_index,
// - kept on the previous step when that step is CM_MULTIPLE and still matches,
// - rejected otherwise,
// - accepted once the index has moved past the last step.
ParserText::Recipe::State
ParserText::Recipe::process_char()
{
    if( m_parent->m_blocked_flags & m_id )
        return( m_state = RS_REJECTED );

    if( cmp_chars( get_char_class_at( m_index ), m_parent->m_cf_curr ) )
    {
        if( m_contents->at( m_index ).applier )
        {
            // tell the applier which recipe it is acting on
            m_parent->m_recipe_cur = this;
            // applier may set a value for m_state:
            ( m_parent->*m_contents->at( m_index ).applier )();
        }

        // unless the applier left the recipe in progress itself, mark it so while keeping
        // only the RS_BLOCK bit of the previous state; note that this also overrides a
        // rejection requested by the applier
        if( !( m_state & RS_IN_PROGRESS ) )
            m_state = ( m_state & RS_BLOCK ) | RS_IN_PROGRESS;

        // appliers may have changed m_index (and even m_contents), see junction_markup()
        m_index++;
    }
    else
    // the current step does not match; the recipe only survives when the previous step may
    // repeat and matches the current character
    if( m_index == 0 ||
        !( m_contents->at( m_index - 1 ).flags & CM_MULTIPLE ) ||
        !cmp_chars( get_char_class_at( m_index - 1 ), m_parent->m_cf_curr ) )
        m_state = RS_REJECTED;
    else
    if( m_contents->at( m_index - 1 ).applier ) // multiply occurring chars can have appliers, too
    {
        m_parent->m_recipe_cur = this;
        // applier may set a value for m_state:
        ( m_parent->*m_contents->at( m_index - 1 ).applier )();
    }

    // all steps are consumed: accepted (this replaces any state set above, except RS_BLOCK)
    if( m_index == m_contents->size() )
        m_state = ( m_state & RS_BLOCK ) | RS_ACCEPTED;

    return m_state;
}

inline void
ParserText::process_char()
{
    m_blocked_flags = 0;

    // UPDATE WORD LAST
    if( m_cf_curr & CF_SPELLCHECK )
    {
        if( not( m_cf_last & CF_SPELLCHECK ) )
        {
            m_word_cur.clear();
            m_word_count++;
        }

        m_word_cur += m_char_cur;
    }
    else
    {
        if( m_cf_curr & CFC_BLANK )
            m_pos_blank = m_pos_cur;
        if( m_flag_check_word && not( m_word_cur.empty() ) && ( m_cf_last & CF_SPELLCHECK ) )
            check_word();
    }

    // FIRST CHECK ACTIVE RECIPES
    for( auto&& it = m_active_recipes.begin(); it != m_active_recipes.end(); )
    {
        Recipe* r{ *it };
        if( !( r->process_char() & Recipe::RS_IN_PROGRESS ) )
        {
            it = m_active_recipes.erase( it );
            delete r;
        }
        else
        {
            it++;
            if( r->m_state & Recipe::RS_BLOCK ) m_blocked_flags |= r->m_blocks;
        }
    }

    // THEN CHECK IF IT TRIGGERS ANY OTHER RECIPE
    for( Recipe* r : m_all_recipes )
    {
        r->m_index = 0;
        r->m_state = Recipe::RS_NOT_SET;
        if( r->process_char() == Recipe::RS_IN_PROGRESS )
            m_active_recipes.push_back( new Recipe( r ) );
    }

    m_cf_last = m_cf_curr;
}

// Registers an additional link protocol: `name` is the word expected right before the colon,
// `id` the recipe id to use and `rc` the recipe table for the part after the colon.
// An already registered name is left untouched.
void
ParserText::add_link_protocol( const std::string& name, Recipe::Id id, Recipe::Contents* rc )
{
    // emplace() would not insert for an existing key and the freshly allocated LinkProtocol
    // would be leaked, so check before allocating
    if( m_link_protocols.find( name ) != m_link_protocols.end() )
        return;

    m_link_protocols.emplace( name, new LinkProtocol( id, rc ) );
}

// JUNCTIONS =======================================================================================
// Last step of the date recipe: takes the number just read as the day and turns the text into
// a link when the resulting date is valid.
void
ParserText::check_date()
{
    m_date_last.set_day( m_int_last );

    if( !m_date_last.is_valid() )
        return;

    apply_link();
}

// Decides the heading level from the char following the leading space of the line:
// a second space means sub-subheading, anything else subheading.
void
ParserText::junction_subheading()
{
    const bool has_second_space{ m_char_cur == ' ' };

    if( !has_second_space )
    {
        apply_subheading();
        return;
    }

    apply_subsubheading();
}

void
ParserText::junction_markup()
{
    set_start();
    m_recipe_cur->m_index = 0;    // as it will be ++
    m_recipe_cur->m_state |= Recipe::RS_BLOCK;

    switch( m_char_cur )
    {
        case '*':
            m_recipe_cur->m_id = RID_BOLD;
            m_recipe_cur->m_blocks = RID_BOLD;
            m_recipe_cur->m_contents = &m_rc_markup_b_end;
            break;
        case '_':
            m_recipe_cur->m_id = RID_ITALIC;
            m_recipe_cur->m_blocks = RID_ITALIC;
            m_recipe_cur->m_contents = &m_rc_markup_i_end;
            break;
        case '#':
            m_recipe_cur->m_id = RID_HIGHLIGHT;
            m_recipe_cur->m_blocks = RID_HIGHLIGHT;
            m_recipe_cur->m_contents = &m_rc_markup_h_end;
            break;
        case '=':
            m_recipe_cur->m_id = RID_STRIKETHROUGH;
            m_recipe_cur->m_blocks = RID_STRIKETHROUGH;
            m_recipe_cur->m_contents = &m_rc_markup_s_end;
            break;
    }
}

// Called for the chars in the body of a markup: switches the recipe id from the opening id
// (RID_BOLD etc.) to the corresponding RID_MARKUP_?_END id. Other ids are left as they are,
// so this has an effect on the first body char only.
void
ParserText::junction_markup2()
{
    auto& id = m_recipe_cur->m_id;

    if( id == RID_BOLD )
        id = RID_MARKUP_B_END;
    else if( id == RID_ITALIC )
        id = RID_MARKUP_I_END;
    else if( id == RID_HIGHLIGHT )
        id = RID_MARKUP_H_END;
    else if( id == RID_STRIKETHROUGH )
        id = RID_MARKUP_S_END;
}

// Called on the space that follows "[?]": dispatches on the status char between the brackets,
// which is located two positions before the current one.
void
ParserText::junction_todo()
{
    const auto status = get_char_at( m_pos_cur - 2 );

    if( status == ' ' )
        apply_check_unf();
    else if( status == '~' )
        apply_check_prg();
    else if( status == '+' )
        apply_check_fin();
    else if( status == 'x' || status == 'X' ||
             status == '>' ) // extra sign for distinguishing deferred items
        apply_check_ccl();
    // anything else is not a known status and is ignored
}

// Called on the separator after the year digits: accepts the number just read as the year when
// it is within [Date::YEAR_MIN, Date::YEAR_MAX], otherwise rejects the date recipe.
void
ParserText::junction_date_dotym()
{
    const bool out_of_range{ m_int_last < Date::YEAR_MIN || m_int_last > Date::YEAR_MAX };

    if( out_of_range )
    {
        m_recipe_cur->m_state = Recipe::RS_REJECTED;
        return;
    }

    m_date_last.set_year( m_int_last );
}

// Called on the separator after the month digits: accepts the number just read as the month
// when it is within 1..12 and this separator equals the one used after the year (three
// positions back), otherwise rejects the date recipe.
void
ParserText::junction_date_dotmd()
{
    if( m_int_last < 1 || m_int_last > 12 ||
        // two separators must be the same:
        get_char_at( m_pos_cur - 3 ) != m_char_cur )
    {
        m_recipe_cur->m_state = Recipe::RS_REJECTED;
        return;
    }

    m_date_last.set_month( m_int_last );
}

void
ParserText::junction_colon()
{
    auto kv_protocol = m_link_protocols.find( m_word_cur );
    if( kv_protocol != m_link_protocols.end() )
    {
        m_active_recipes.push_back(
            new Recipe{ kv_protocol->second->type, this, kv_protocol->second->rc, RID_LINK_AT,
                        m_pos_cur - m_word_cur.length(), m_pos_cur + 1 } );
                        // pos_middle is only used by chart

        if( m_pos_cur - m_word_cur.length() > 0 &&
            get_char_at( m_pos_cur - m_word_cur.length() - 1 ) == '<' )
            m_active_recipes.back()->m_flag_accept_spaces = true;

        m_active_recipes.back()->m_state |= Recipe::RS_BLOCK;
    }
}

void
ParserText::junction_at()
{
    m_recipe_cur->m_pos_bgn = m_pos_blank + 1;
}

void
ParserText::junction_link()
{
    //if( m_recipe_cur->m_pos_bgn > 0 && get_char_at( m_recipe_cur->m_pos_bgn - 1 ) == '<' )
    if( m_recipe_cur->m_flag_accept_spaces )
    {
        m_active_recipes.push_back(
                new Recipe{ m_recipe_cur->m_id, this, &m_rc_link_hidden_end, 0,
                            m_recipe_cur->m_pos_bgn - 1, m_pos_cur } );
    }
    else
    {
        if( m_recipe_cur->m_id == RID_ID )
            m_recipe_cur->m_int_value = m_int_last;
        apply_link();
    }
}

// Records the position of the last non-space char of a numeric run in m_pos_extra_2.
void
ParserText::junction_number()
{
    // spaces may be used within numbers as thousands separator per ISO 31-0 standard...
    // ...and are therefore disregarded here
    if( m_char_cur == ' ' )
        return;

    m_pos_extra_2 = m_pos_cur;
}

// Called on the char that follows the closing colon of a tag: applies the tag and, when that
// char is '=', remembers its position and lets the recipe go on with the value part.
void
ParserText::junction_tag()
{
    apply_inline_tag();

    if( m_char_cur != '=' )
    {
        // do not continue with this recipe:
        m_recipe_cur->m_index = ( m_recipe_cur->m_contents->size() - 1 );
        return;
    }

    m_recipe_cur->m_pos_mid = m_pos_cur;
}

// Called on the char that ends the first number of a tag value: a '/' announces a second
// number and its position is remembered, anything else finalizes the tag.
void
ParserText::junction_tag2()
{
    if( m_char_cur == '/' )
    {
        m_pos_extra_1 = m_pos_cur;
        return;
    }

    apply_inline_tag();
    // do not continue with this recipe:
    m_recipe_cur->m_index = ( m_recipe_cur->m_contents->size() - 1 );
}

// HELPERS =========================================================================================
inline void
ParserText::set_start()
{
    m_recipe_cur->m_pos_bgn = m_pos_cur;
}

inline void
ParserText::set_middle()
{
    m_recipe_cur->m_pos_mid = m_pos_cur;
}

inline void
ParserText::add_block()
{
    m_recipe_cur->m_state |= Recipe::RS_BLOCK;
}

// Keeps m_int_last equal to the value of the digit run that the current digit belongs to:
// a digit following another digit extends the number, any other digit starts a new one.
inline void
ParserText::process_number()
{
    const bool continues_run( m_cf_last & CF_DIGIT );

    if( !continues_run )
    {
        m_int_last = ( m_char_cur - '0' );
        return;
    }

    m_int_last *= 10;
    m_int_last += ( m_char_cur - '0' );
}
