/*
 * Small jpeg decoder library
 *
 * Copyright (c) 2006, Luc Saillard <luc@saillard.org>
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *  this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *  this list of conditions and the following disclaimer in the documentation
 *  and/or other materials provided with the distribution.
 *
 * - Neither the name of the author nor the names of its contributors may be
 *  used to endorse or promote products derived from this software without
 *  specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>

#include "tinyjpeg.h"
#include "tinyjpeg-internal.h"

/*
 * Global buffer holding the message for the last error found while decoding.
 * It is filled with snprintf() by the decoding code in this file and exposed
 * to callers through tinyjpeg_get_errorstring().
 */
char error_string[256];

/*
 * Zigzag lookup table for an 8x8 block.
 *
 * For the coefficient at row-major index j, zigzag[j] is its position in the
 * zigzag-ordered sequence in which coefficients are decoded from the stream.
 * process_Huffman_data_unit() uses it to reorder the decoded coefficients:
 *     c->DCT[j] = DCT[zigzag[j]]
 */
static const unsigned char zigzag[64] = 
{
   0,  1,  5,  6, 14, 15, 27, 28,
   2,  4,  7, 13, 16, 26, 29, 42,
   3,  8, 12, 17, 25, 30, 41, 43,
   9, 11, 18, 24, 31, 40, 44, 53,
  10, 19, 23, 32, 39, 45, 52, 54,
  20, 22, 33, 38, 46, 51, 55, 60,
  21, 34, 37, 47, 50, 56, 59, 61,
  35, 36, 48, 49, 57, 58, 62, 63
};

/*
 * 4 functions to manage the stream
 *
 *  fill_nbits: put at least nbits in the reservoir of bits,
 *              converting any 0xff,0x00 byte pair into a single 0xff.
 *  get_nbits: read nbits from the stream and put them in result;
 *             the bits are removed from the stream and the reservoir is
 *             refilled automatically. The result is signed according to the
 *             number of bits.
 *  look_nbits: read nbits from the stream without marking them as read.
 *  skip_nbits: discard nbits from the reservoir without returning them.
 * 
 * stream: current pointer in the jpeg data (read bytes per bytes)
 * nbits_in_reservoir: number of bits filled into the reservoir
 * reservoir: register that contains bits information. Only nbits_in_reservoir
 *            is valid.
 *                          nbits_in_reservoir
 *                        <--    17 bits    -->
 *            Ex: 0000 0000 1010 0000 1111 0000   <== reservoir
 *                        ^
 *                        bit 1
 *            To get two bits from this example
 *                 result = (reservoir >> 15) & 3
 *
 */

/*
 * fill_nbits: refill the reservoir, one byte at a time, until it holds at
 * least nbits_wanted bits.
 *
 * A 0xff byte followed by a stuffed 0x00 is stored as a single 0xff; the
 * 0x00 is only looked at when it lies inside the stream, so the macro never
 * reads past priv->stream_end.
 *
 * NOTE: the macro expects a variable named `priv` (with a `stream_end`
 * member) in the calling scope, and it executes `return -1;` in the CALLING
 * function when the stream is exhausted before enough bits are available.
 *
 * All four macros expand to a single statement WITHOUT a trailing semicolon,
 * so they can be used safely in unbraced if/else bodies.
 */
#define fill_nbits(reservoir,nbits_in_reservoir,stream,nbits_wanted) do { \
   while ((nbits_in_reservoir) < (nbits_wanted)) \
    { \
      unsigned char fill_byte_; \
      if ((stream) >= priv->stream_end) \
        return -1; \
      fill_byte_ = *(stream)++; \
      (reservoir) <<= 8; \
      if (fill_byte_ == 0xff && (stream) < priv->stream_end && *(stream) == 0x00) \
        (stream)++; \
      (reservoir) |= fill_byte_; \
      (nbits_in_reservoir) += 8; \
    } \
}  while(0)


/*
 * get_nbits: consume nbits_wanted bits and store them in result as a signed
 * value: when the leading bit of the field is 0 the value is mapped to the
 * negative range by adding -(2^nbits_wanted) + 1.
 * nbits_wanted must be at least 1. May return -1 from the calling function
 * (see fill_nbits).
 */
#define get_nbits(reservoir,nbits_in_reservoir,stream,nbits_wanted,result) do { \
   fill_nbits(reservoir,nbits_in_reservoir,stream,(nbits_wanted)); \
   (result) = ((reservoir)>>((nbits_in_reservoir)-(nbits_wanted))); \
   (nbits_in_reservoir) -= (nbits_wanted);  \
   (reservoir) &= ((1U<<(nbits_in_reservoir))-1); \
   if ((unsigned int)(result) < (1UL<<((nbits_wanted)-1))) \
       (result) += (0xFFFFFFFFUL<<(nbits_wanted))+1; \
}  while(0)

/*
 * look_nbits: store the next nbits_wanted bits in result without consuming
 * them. May return -1 from the calling function (see fill_nbits).
 */
#define look_nbits(reservoir,nbits_in_reservoir,stream,nbits_wanted,result) do { \
   fill_nbits(reservoir,nbits_in_reservoir,stream,(nbits_wanted)); \
   (result) = ((reservoir)>>((nbits_in_reservoir)-(nbits_wanted))); \
}  while(0)

/*
 * skip_nbits: drop nbits_wanted bits from the reservoir.
 * To speed up the decoding, we assume that the reservoir already holds
 * enough bits (no refill is attempted; `stream` is unused).
 */
#define skip_nbits(reservoir,nbits_in_reservoir,stream,nbits_wanted) do { \
   (nbits_in_reservoir) -= (nbits_wanted); \
   (reservoir) &= ((1U<<(nbits_in_reservoir))-1); \
}  while(0)

static void resync(struct jdec_private *priv);

/**
 * Decode the next Huffman symbol from the bit stream.
 *
 * Fast path: peek HUFFMAN_HASH_NBITS bits and use them as an index into
 * huffman_table->lookup. A non-negative entry is the decoded value; its code
 * length is read from huffman_table->code_size and that many bits are
 * consumed.
 *
 * Slow path: for longer codes, peek one more bit per round (up to 16 bits in
 * total) and scan the matching huffman_table->slowtable row, which is a
 * zero-terminated list of (code, value) pairs.
 *
 * Returns the decoded value. If no code of any length matches, 0 is returned
 * and no bits are consumed. If the stream runs out while peeking, the
 * look_nbits macro makes this function return -1.
 */
static int get_next_huffman_code(struct jdec_private *priv, struct huffman_table *huffman_table)
{
	int peeked;
	int symbol;
	unsigned int extra;

	/* Fast path: direct table lookup on the first HUFFMAN_HASH_NBITS bits */
	look_nbits(priv->reservoir, priv->nbits_in_reservoir, priv->stream, HUFFMAN_HASH_NBITS, peeked);
	symbol = huffman_table->lookup[peeked];
	if (__likely(symbol >= 0))
	{
		unsigned int code_size = huffman_table->code_size[symbol];
		skip_nbits(priv->reservoir, priv->nbits_in_reservoir, priv->stream, code_size);
		return symbol;
	}

	/* Slow path: try every longer code length in turn */
	for (extra = 0; extra < 16-HUFFMAN_HASH_NBITS; extra++)
	{
		unsigned int code_len = HUFFMAN_HASH_NBITS + 1 + extra;
		uint16_t *entry;

		look_nbits(priv->reservoir, priv->nbits_in_reservoir, priv->stream, code_len, peeked);

		/* Linear search through the (code, value) pairs of this length */
		for (entry = huffman_table->slowtable[extra]; entry[0]; entry += 2)
		{
			if (entry[0] == peeked)
			{
				skip_nbits(priv->reservoir, priv->nbits_in_reservoir, priv->stream, code_len);
				return entry[1];
			}
		}
	}

	/* No matching code found */
	return 0;
}

/**
 * Decode a single 8x8 block of DCT coefficients for one component.
 *
 * The DC coefficient is coded as a difference from the previous block's DC
 * value (c->previous_DC); the AC coefficients are run-length coded, each
 * Huffman symbol carrying a run of zeros (high nibble) and the bit size of
 * the following value (low nibble).
 *
 * On success the coefficients are stored, already de-zigzagged, in
 * priv->component_infos[component].DCT.
 *
 * Returns 0 on success, -1 when the stream ends prematurely or the Huffman
 * data is invalid (in the latter case error_string describes the problem).
 */
static int process_Huffman_data_unit(struct jdec_private *priv, int component)
{
	struct component *c = &priv->component_infos[component];
	short int DCT[64];
	unsigned int j;	/* wide enough that j + run can never wrap */
	int retcode;

	/* Initialize the DCT coef table */
	memset(DCT, 0, sizeof(DCT));

	/* DC coefficient decoding */
	retcode = get_next_huffman_code(priv, c->DC_table);
	if (retcode < 0)
		return -1;	/* End of stream */
	if (retcode > 16)
	{
		/* The symbol is a bit count; anything above 16 cannot be read
		 * through the bit reservoir and would lead to invalid shifts. */
		snprintf(error_string, sizeof(error_string), "Bad huffman data (invalid DC size)");
		return -1;
	}
	if (retcode)
	{
		unsigned int dc_nbits = (unsigned int)retcode;

		get_nbits(priv->reservoir, priv->nbits_in_reservoir, priv->stream, dc_nbits, DCT[0]);
		DCT[0] += c->previous_DC;
		c->previous_DC = DCT[0];
	}
	else
	{
		DCT[0] = c->previous_DC;
	}

	/* AC coefficient decoding */
	j = 1;
	while (j < 64)
	{
		unsigned char size_val, count_0;

		retcode = get_next_huffman_code(priv, c->AC_table);
		if (retcode < 0)
			return -1;	/* End of stream */

		size_val = retcode & 0xF;
		count_0 = (unsigned char)(retcode >> 4);

		if (size_val == 0)
		{ /* RLE */
			if (count_0 == 0)
				break;		/* EOB found, go out */
			if (count_0 == 0xF)
				j += 16;	/* skip 16 zeros */
			continue;
		}

		j += count_0;	/* skip count_0 zeroes */
		if (__unlikely(j >= 64))
		{
			snprintf(error_string, sizeof(error_string), "Bad huffman data (buffer overflow)");
			return -1;	/* report the error instead of silently succeeding */
		}
		get_nbits(priv->reservoir, priv->nbits_in_reservoir, priv->stream, size_val, DCT[j]);
		j++;
	}

	/* Reorder from zigzag (stream) order to natural order */
	for (j = 0; j < 64; j++)
		c->DCT[j] = DCT[zigzag[j]];
	return 0;
}

/*******************************************************************************
 *
 * Colorspace conversion routine
 *
 * Note:
 * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
 * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
 * The conversion equations to be implemented are therefore
 *      R = Y                + 1.40200 * Cr
 *      G = Y - 0.34414 * Cb - 0.71414 * Cr
 *      B = Y + 1.77200 * Cb
 * 
 ******************************************************************************/

/* Saturate an integer to the 0..255 range of an 8-bit sample. */
static unsigned char clamp(int i)
{
	if (i < 0)
		return 0;
	return (unsigned char)(i > 255 ? 255 : i);
}

/* Number of fractional bits used by the fixed-point colorspace arithmetic */
#define SCALEBITS       10
/* The value 0.5 in that fixed-point format; it is added before the final
 * right shift by SCALEBITS so that results are rounded rather than truncated.
 * Note that the constant has type unsigned long. */
#define ONE_HALF        (1UL << (SCALEBITS-1))
/* Convert the real constant x to fixed point: x * 2^SCALEBITS, with 0.5 added
 * before the truncating conversion to int */
#define FIX(x)          ((int)((x) * (1UL<<SCALEBITS) + 0.5))

/*
 * Convert one luma sample plus the precomputed chroma contributions into an
 * RGB triplet, store it at dst and return the position just after it.
 */
static unsigned char *put_rgb24_pixel(unsigned char *dst, int luma, int add_r, int add_g, int add_b)
{
	int y = luma << SCALEBITS;

	*dst++ = clamp((y + add_r) >> SCALEBITS);
	*dst++ = clamp((y + add_g) >> SCALEBITS);
	*dst++ = clamp((y + add_b) >> SCALEBITS);
	return dst;
}

/**
 *  YCrCb -> RGB24 (2x2)
 *  .-------.
 *  | 1 | 2 |
 *  |---+---|
 *  | 3 | 4 |
 *  `-------'
 *
 * Converts one decoded MCU: a 16x16 block of luma samples (priv->Y, 16
 * samples per row) with 8x8 blocks of chroma samples (priv->Cb, priv->Cr).
 * Every chroma sample is shared by a 2x2 square of luma samples, so each
 * inner iteration writes two pixels on the current output row and two on the
 * row below. Output goes to priv->plane as 3 bytes per pixel (R, G, B), with
 * consecutive rows priv->width*3 bytes apart.
 */
static void YCrCB_to_RGB24_2x2(struct jdec_private *priv)
{
	const unsigned char *Y = priv->Y;
	const unsigned char *Cb = priv->Cb;
	const unsigned char *Cr = priv->Cr;
	unsigned char *upper = priv->plane;			/* even output row */
	unsigned char *lower = priv->plane + priv->width*3;	/* odd output row */
	/* Two output rows forward, minus the 16 pixels just written */
	int offset_to_next_row = (priv->width*3*2) - 16*3;
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			int cb = *Cb++ - 128;
			int cr = *Cr++ - 128;
			/* Chroma terms are shared by the four pixels of the square */
			int add_r = FIX(1.40200) * cr + ONE_HALF;
			int add_g = - FIX(0.34414) * cb - FIX(0.71414) * cr + ONE_HALF;
			int add_b = FIX(1.77200) * cb + ONE_HALF;

			upper = put_rgb24_pixel(upper, Y[0], add_r, add_g, add_b);
			upper = put_rgb24_pixel(upper, Y[1], add_r, add_g, add_b);
			/* Same two columns, one luma row (16 samples) further down */
			lower = put_rgb24_pixel(lower, Y[16], add_r, add_g, add_b);
			lower = put_rgb24_pixel(lower, Y[16+1], add_r, add_g, add_b);
			Y += 2;
		}
		Y += 16;	/* skip the luma row already consumed via `lower` */
		upper += offset_to_next_row;
		lower += offset_to_next_row;
	}
}

/*
 * Decode a 2x2 MCU
 *  .-------.
 *  | 1 | 2 |
 *  |---+---|
 *  | 3 | 4 |
 *  `-------'
 *
 * Four luma data units are decoded and inverse-transformed into the four
 * quadrants of priv->Y (row stride 16), followed by one Cb and one Cr data
 * unit into priv->Cb and priv->Cr (row stride 8).
 *
 * Returns 0 on success, -1 as soon as one data unit fails to decode.
 */
static int decode_MCU_2x2_3planes(struct jdec_private *priv)
{
	/* Start of each 8x8 luma quadrant inside the 16-sample-wide Y buffer */
	static const int luma_offset[4] = { 0, 8, 64*2, 64*2+8 };
	int quadrant;

	// Y
	for (quadrant = 0; quadrant < 4; quadrant++) {
		if (process_Huffman_data_unit(priv, cY) != 0)
			return -1;
		IDCT(&priv->component_infos[cY], priv->Y + luma_offset[quadrant], 16);
	}

	// Cb
	if (process_Huffman_data_unit(priv, cCb) != 0)
		return -1;
	IDCT(&priv->component_infos[cCb], priv->Cb, 8);

	// Cr
	if (process_Huffman_data_unit(priv, cCr) != 0)
		return -1;
	IDCT(&priv->component_infos[cCr], priv->Cr, 8);

	return 0;
}

/*
 * Reset the entropy-decoder state: empty the bit reservoir and clear the DC
 * predictor of every component.
 */
static void resync(struct jdec_private *priv)
{
	int comp = 0;

	/* Drop any bits still buffered */
	priv->nbits_in_reservoir = 0;
	priv->reservoir = 0;

	/* Init DC coefficients */
	while (comp < COMPONENTS) {
		priv->component_infos[comp].previous_DC = 0;
		comp++;
	}
}

/*
 * Advance priv->stream to just after the next expected restart marker
 * (RST + priv->last_rst_marker_seen) and step the expected marker number
 * (modulo 8).
 *
 * Every byte is bounds-checked against priv->stream_end before it is read,
 * so a truncated stream is reported instead of being read past its end.
 *
 * Returns 0 when the marker was found, and also when EOI is met first (in
 * which case priv->stream is left untouched). Failures go through error(),
 * which is expected to record the message and return from this function.
 */
static int find_next_rst_marker(struct jdec_private *priv)
{
	int rst_marker_found = 0;
	int marker;
	const unsigned char *stream = priv->stream;
	const unsigned char *end = priv->stream_end;

	/* Parse marker */
	while (!rst_marker_found)
	{
		/* Scan forward to the next 0xff byte */
		do
		{
			if (stream >= end)
				error("EOF while search for a RST marker.");
		} while (*stream++ != 0xff);

		/* Skip any padding ff byte (this is normal) */
		while (stream < end && *stream == 0xff)
			stream++;

		/* The marker code itself must still be inside the stream */
		if (stream >= end)
			error("EOF while search for a RST marker.");

		marker = *stream++;
		if ((RST+priv->last_rst_marker_seen) == marker)
			rst_marker_found = 1;
		else if (marker >= RST && marker <= RST7)
			error("Wrong Reset marker found, abording");
		else if (marker == EOI)
			return 0;
	}
	trace("RST Marker %d found at offset %ld\n", priv->last_rst_marker_seen, (long)(stream - priv->stream_begin));

	priv->stream = stream;
	priv->last_rst_marker_seen++;
	priv->last_rst_marker_seen &= 7;

	return 0;
}

/*******************************************************************************
 *
 * Functions exported by the library.
 *
 * Note: Some applications may directly access the internal pointers of the
 * structure. This is not recommended, but if you have many images to
 * uncompress with the same parameters, some functions can be called to speed
 * up the decoding.
 *
 ******************************************************************************/

/**
 * Allocate a new, zero-initialized tinyjpeg decoder object.
 *
 * An object must be allocated before any other function can be used.
 * Returns NULL when the allocation fails.
 */
struct jdec_private *tinyjpeg_init(void)
{
	/* calloc() already yields NULL on failure, so its result is returned as is */
	return (struct jdec_private *)calloc(1, sizeof(struct jdec_private));
}

/**
 * Free a tinyjpeg object.
 *
 * Only the decoder structure itself is released; buffers it points to (for
 * example priv->components[]) are not freed by this function.
 * No other function can be called on the object after this one.
 */
void tinyjpeg_free(struct jdec_private *priv)
{
	free(priv);
}


/**
 * Create a new JPEG decode task
 *
 */
struct jdec_private *create_jdec_priv_task(struct jdec_private *priv, int tasknum)
{
	struct jdec_private *jdec_task;

	jdec_task = tinyjpeg_init();
	resync(priv);
	if (tasknum > 0){
		find_next_rst_marker(priv);
	}
	memcpy(jdec_task, priv, sizeof(struct jdec_private));
	
	jdec_task->mcus_posx = (tasknum * priv->restart_interval) % priv->mcus_in_width;
	jdec_task->mcus_posy = (tasknum * priv->restart_interval) / priv->mcus_in_width;
	
	return jdec_task;
}

/**
 * Initialize the tinyjpeg object and prepare the decoding of the stream.
 *
 * Checks that the buffer starts with the SOI marker, records the stream
 * boundaries (everything after the two SOI bytes) in `priv`, and hands the
 * stream to parse_JFIF().
 *
 * buf:  JPEG data, at least 2 bytes long.
 * size: number of bytes available in buf.
 *
 * Returns the result of parse_JFIF(). A missing, too short or non-JPEG
 * buffer is rejected through error(), which is expected to record the
 * message and return from this function.
 */
int tinyjpeg_parse_header(struct jdec_private *priv, const unsigned char *buf, unsigned int size)
{
	/* The SOI marker needs two bytes; also keeps size-2 below from wrapping */
	if (buf == NULL || size < 2)
		error("Not a JPG file ?\n");

	/* Identify the file */
	if ((buf[0] != 0xFF) || (buf[1] != SOI))
		error("Not a JPG file ?\n");

	priv->stream_begin = buf+2;
	priv->stream_length = size-2;
	priv->stream_end = priv->stream_begin + priv->stream_length;

	return parse_JFIF(priv, priv->stream_begin);
}

/**
 * Task body: decode one restart interval and convert it to RGB.
 *
 * `data` points to a tinyjpeg_decode_task_args; its `priv` member is the
 * decoder state of this task, whose mcus_posx/mcus_posy give the first MCU
 * to decode. priv->restart_interval MCUs are decoded and written into
 * priv->components[0], which must already point to the output buffer
 * (nothing is allocated here). The task ends by calling VSs__end_task().
 *
 * A decoding failure is reported on stderr; the loop then carries on with
 * the next MCU.
 *
 * NOTE(review): the loop always runs for a full restart interval; confirm
 * that the caller never schedules a task whose interval extends past the
 * last MCU of the image.
 */
void tinyjpeg_decode_task(void *data, SlaveVP *animatingSlv )
{
	tinyjpeg_decode_task_args *task_args = (tinyjpeg_decode_task_args *) data;
	struct jdec_private *priv = task_args->priv;
	uint8_t *context = task_args->context;

	// Only 2x2 CU sizes are supported in this simple decoder
	decode_MCU_fct decode_MCU = decode_MCU_2x2_3planes;
	convert_colorspace_fct convert_to_pixfmt = YCrCB_to_RGB24_2x2;
	unsigned int xstride_by_mcu = MCU_X_STRIDE;
	unsigned int ystride_by_mcu = MCU_Y_STRIDE;

	/* Bytes in one pixel row, then scaled to one row of MCUs */
	unsigned int bytes_per_blocklines = priv->width * RGB_DEPTH;
	/* Bytes in one 8-pixel-wide row segment, then scaled to the MCU width */
	unsigned int bytes_per_mcu = RGB_DEPTH*8;
	unsigned int mcu;

	/* The context pointer is not needed by the decoder itself */
	(void) context;

	bytes_per_blocklines *= ystride_by_mcu;
	bytes_per_mcu *= xstride_by_mcu/8;

	/* Top-left pixel of the first MCU handled by this task */
	priv->plane = priv->components[0] + (priv->mcus_posy * bytes_per_blocklines) + (priv->mcus_posx * bytes_per_mcu);

	/* Decode the interval macroblock by macroblock */
	for (mcu = 0; mcu < priv->restart_interval; mcu++) {
		if (decode_MCU(priv) != 0)
			fprintf(stderr, "%s\n", error_string);
		convert_to_pixfmt(priv);

		priv->plane += bytes_per_mcu;
		priv->mcus_posx++;
		if (priv->mcus_posx < priv->mcus_in_width)
			continue;

		/* End of an MCU row: move to the first MCU of the next one */
		priv->mcus_posy++;
		priv->mcus_posx = 0;
		priv->plane += (bytes_per_blocklines - priv->width*3);
	}
	VSs__end_task(animatingSlv);
}

/**
 * Return the message describing the last error recorded by the decoder.
 *
 * The returned pointer refers to the global error_string buffer, so the text
 * is overwritten by the next error and must not be freed by the caller.
 */
const char *tinyjpeg_get_errorstring(void)
{
	return error_string;
}

/**
 * Store the image dimensions recorded in `priv` into *width and *height.
 *
 * Both output pointers are written unconditionally and must be valid.
 */
void tinyjpeg_get_size(struct jdec_private *priv, unsigned int *width, unsigned int *height)
{
	*width = priv->width;
	*height = priv->height;
}

/**
 * Copy the decoder's component buffer pointers into `components`.
 *
 * Copying stops at the first NULL entry or after COMPONENTS entries,
 * whichever comes first; the remaining slots of `components` are left
 * untouched. Always returns 0.
 */
int tinyjpeg_get_components(struct jdec_private *priv, unsigned char **components)
{
	int i;

	/* Test the index first so that priv->components[COMPONENTS] is never read */
	for (i = 0; i < COMPONENTS && priv->components[i]; i++)
		components[i] = priv->components[i];
	return 0;
}