/*-------------------------------------------------------------------------
 * drawElements Quality Program OpenGL ES 3.1 Module
 * -------------------------------------------------
 *
 * Copyright 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Shader atomic operation tests.
 *//*--------------------------------------------------------------------*/

#include "es31fShaderAtomicOpTests.hpp"
#include "gluShaderProgram.hpp"
#include "gluShaderUtil.hpp"
#include "gluRenderContext.hpp"
#include "gluObjectWrapper.hpp"
#include "gluProgramInterfaceQuery.hpp"
#include "tcuVector.hpp"
#include "tcuTestLog.hpp"
#include "tcuVectorUtil.hpp"
#include "tcuFormatUtil.hpp"
#include "deStringUtil.hpp"
#include "deRandom.hpp"
#include "glwFunctions.hpp"
#include "glwEnums.hpp"

#include <algorithm>
#include <set>

namespace deqp
{
namespace gles31
{
namespace Functional
{

using std::string;
using std::vector;
using tcu::TestLog;
using tcu::UVec3;
using std::set;
using namespace glu;

template<typename T, int Size>
static inline T product (const tcu::Vector<T, Size>& v)
{
	T res = v[0];
	for (int ndx = 1; ndx < Size; ndx++)
		res *= v[ndx];
	return res;
}

class ShaderAtomicOpCase : public TestCase
{
public:
							ShaderAtomicOpCase	(Context& context, const char* name, const char* funcName, AtomicOperandType operandType, DataType type, Precision precision, const UVec3& workGroupSize);
							~ShaderAtomicOpCase	(void);

	void					init				(void);
	void					deinit				(void);
	IterateResult			iterate				(void);

protected:
	virtual void			getInputs			(int numValues, int stride, void* inputs) const = 0;
	virtual bool			verify				(int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const = 0;

	const string			m_funcName;
	const AtomicOperandType	m_operandType;
	const DataType			m_type;
	const Precision			m_precision;

	const UVec3				m_workGroupSize;
	const UVec3				m_numWorkGroups;

	deUint32				m_initialValue;

private:
							ShaderAtomicOpCase	(const ShaderAtomicOpCase& other);
	ShaderAtomicOpCase&		operator=			(const ShaderAtomicOpCase& other);

	ShaderProgram*			m_program;
};

ShaderAtomicOpCase::ShaderAtomicOpCase (Context& context, const char* name, const char* funcName, AtomicOperandType operandType, DataType type, Precision precision, const UVec3& workGroupSize)
	: TestCase			(context, name, funcName)
	, m_funcName		(funcName)
	, m_operandType		(operandType)
	, m_type			(type)
	, m_precision		(precision)
	, m_workGroupSize	(workGroupSize)
	, m_numWorkGroups	(4,4,4)
	, m_initialValue	(0)
	, m_program			(DE_NULL)
{
}

ShaderAtomicOpCase::~ShaderAtomicOpCase (void)
{
	ShaderAtomicOpCase::deinit();
}

void ShaderAtomicOpCase::init (void)
{
	const bool			isSSBO		= m_operandType == ATOMIC_OPERAND_BUFFER_VARIABLE;
	const char*			precName	= getPrecisionName(m_precision);
	const char*			typeName	= getDataTypeName(m_type);

	const DataType		outType		= isSSBO ? m_type : glu::TYPE_UINT;
	const char*			outTypeName	= getDataTypeName(outType);

	const deUint32		numValues	= product(m_workGroupSize)*product(m_numWorkGroups);
	std::ostringstream	src;

	src << "#version 310 es\n"
		<< "layout(local_size_x = " << m_workGroupSize.x()
		<< ", local_size_y = " << m_workGroupSize.y()
		<< ", local_size_z = " << m_workGroupSize.z() << ") in;\n"
		<< "layout(binding = 0) buffer InOut\n"
		<< "{\n"
		<< "	" << precName << " " << typeName << " inputValues[" << numValues << "];\n"
		<< "	" << precName << " " << outTypeName << " outputValues[" << numValues << "];\n"
		<< "	" << (isSSBO ? "coherent " : "") << precName << " " << outTypeName << " groupValues[" << product(m_numWorkGroups) << "];\n"
		<< "} sb_inout;\n";

	if (!isSSBO)
		src << "shared " << precName << " " << typeName << " s_var;\n";

	src << "\n"
		<< "void main (void)\n"
		<< "{\n"
		<< "	uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "	uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "	uint globalOffs = localSize*globalNdx;\n"
		<< "	uint offset     = globalOffs + gl_LocalInvocationIndex;\n"
		<< "\n";

	if (isSSBO)
	{
		DE_ASSERT(outType == m_type);
		src << "	sb_inout.outputValues[offset] = " << m_funcName << "(sb_inout.groupValues[globalNdx], sb_inout.inputValues[offset]);\n";
	}
	else
	{
		const string		castBeg	= outType != m_type ? (string(outTypeName) + "(") : string("");
		const char* const	castEnd	= outType != m_type ? ")" : "";

		src << "	if (gl_LocalInvocationIndex == 0u)\n"
			<< "		s_var = " << typeName << "(" << tcu::toHex(m_initialValue) << "u);\n"
			<< "	barrier();\n"
			<< "	" << precName << " " << typeName << " res = " << m_funcName << "(s_var, sb_inout.inputValues[offset]);\n"
			<< "	sb_inout.outputValues[offset] = " << castBeg << "res" << castEnd << ";\n"
			<< "	barrier();\n"
			<< "	if (gl_LocalInvocationIndex == 0u)\n"
			<< "		sb_inout.groupValues[globalNdx] = " << castBeg << "s_var" << castEnd << ";\n";
	}

	src << "}\n";

	DE_ASSERT(!m_program);
	m_program = new ShaderProgram(m_context.getRenderContext(), ProgramSources() << ComputeSource(src.str()));

	m_testCtx.getLog() << *m_program;

	if (!m_program->isOk())
	{
		delete m_program;
		m_program = DE_NULL;
		throw tcu::TestError("Compile failed");
	}
}

void ShaderAtomicOpCase::deinit (void)
{
	delete m_program;
	m_program = DE_NULL;
}

ShaderAtomicOpCase::IterateResult ShaderAtomicOpCase::iterate (void)
{
	const glw::Functions&		gl				= m_context.getRenderContext().getFunctions();
	const deUint32				program			= m_program->getProgram();
	const Buffer				inoutBuffer		(m_context.getRenderContext());
	const deUint32				blockNdx		= gl.getProgramResourceIndex(program, GL_SHADER_STORAGE_BLOCK, "InOut");
	const InterfaceBlockInfo	blockInfo		= getProgramInterfaceBlockInfo(gl, program, GL_SHADER_STORAGE_BLOCK, blockNdx);
	const deUint32				inVarNdx		= gl.getProgramResourceIndex(program, GL_BUFFER_VARIABLE, "InOut.inputValues[0]");
	const InterfaceVariableInfo	inVarInfo		= getProgramInterfaceVariableInfo(gl, program, GL_BUFFER_VARIABLE, inVarNdx);
	const deUint32				outVarNdx		= gl.getProgramResourceIndex(program, GL_BUFFER_VARIABLE, "InOut.outputValues[0]");
	const InterfaceVariableInfo	outVarInfo		= getProgramInterfaceVariableInfo(gl, program, GL_BUFFER_VARIABLE, outVarNdx);
	const deUint32				groupVarNdx		= gl.getProgramResourceIndex(program, GL_BUFFER_VARIABLE, "InOut.groupValues[0]");
	const InterfaceVariableInfo	groupVarInfo	= getProgramInterfaceVariableInfo(gl, program, GL_BUFFER_VARIABLE, groupVarNdx);
	const deUint32				numValues		= product(m_workGroupSize)*product(m_numWorkGroups);

	TCU_CHECK(inVarInfo.arraySize == numValues &&
			  outVarInfo.arraySize == numValues &&
			  groupVarInfo.arraySize == product(m_numWorkGroups));

	gl.useProgram(program);

	// Setup buffer.
	{
		vector<deUint8> bufData(blockInfo.dataSize);
		std::fill(bufData.begin(), bufData.end(), 0);

		getInputs((int)numValues, (int)inVarInfo.arrayStride, &bufData[0] + inVarInfo.offset);

		if (m_operandType == ATOMIC_OPERAND_BUFFER_VARIABLE)
		{
			for (deUint32 valNdx = 0; valNdx < product(m_numWorkGroups); valNdx++)
				*(deUint32*)(&bufData[0] + groupVarInfo.offset + groupVarInfo.arrayStride*valNdx) = m_initialValue;
		}

		gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *inoutBuffer);
		gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockInfo.dataSize, &bufData[0], GL_STATIC_READ);
		gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *inoutBuffer);
		GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
	}

	gl.dispatchCompute(m_numWorkGroups.x(), m_numWorkGroups.y(), m_numWorkGroups.z());

	// Read back and compare
	{
		const void*		resPtr		= gl.mapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, blockInfo.dataSize, GL_MAP_READ_BIT);
		bool			isOk		= true;

		GLU_EXPECT_NO_ERROR(gl.getError(), "glMapBufferRange()");
		TCU_CHECK(resPtr);

		isOk = verify((int)numValues,
					  (int)inVarInfo.arrayStride, (const deUint8*)resPtr + inVarInfo.offset,
					  (int)outVarInfo.arrayStride, (const deUint8*)resPtr + outVarInfo.offset,
					  (int)groupVarInfo.arrayStride, (const deUint8*)resPtr + groupVarInfo.offset);

		gl.unmapBuffer(GL_SHADER_STORAGE_BUFFER);
		GLU_EXPECT_NO_ERROR(gl.getError(), "glUnmapBuffer()");

		m_testCtx.setTestResult(isOk ? QP_TEST_RESULT_PASS	: QP_TEST_RESULT_FAIL,
								isOk ? "Pass"				: "Comparison failed");
	}

	return STOP;
}

class ShaderAtomicAddCase : public ShaderAtomicOpCase
{
public:
	ShaderAtomicAddCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
		: ShaderAtomicOpCase(context, name, "atomicAdd", operandType, type, precision, UVec3(3,2,1))
	{
		m_initialValue = 1;
	}

protected:
	void getInputs (int numValues, int stride, void* inputs) const
	{
		de::Random	rnd			(deStringHash(getName()));
		const int	maxVal		= m_precision == PRECISION_LOWP ? 2 : 32;
		const int	minVal		= 1;

		// \todo [2013-09-04 pyry] Negative values!

		for (int valNdx = 0; valNdx < numValues; valNdx++)
			*(int*)((deUint8*)inputs + stride*valNdx) = rnd.getInt(minVal, maxVal);
	}

	bool verify (int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const
	{
		const int	workGroupSize	= (int)product(m_workGroupSize);
		const int	numWorkGroups	= numValues/workGroupSize;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int	groupOffset		= groupNdx*workGroupSize;
			const int	groupOutput		= *(const int*)((const deUint8*)groupOutputs + groupNdx*groupStride);
			set<int>	outValues;
			bool		maxFound		= false;
			int			valueSum		= (int)m_initialValue;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const int inputValue = *(const int*)((const deUint8*)inputs + inputStride*(groupOffset+localNdx));
				valueSum += inputValue;
			}

			if (groupOutput != valueSum)
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ": expected sum " << valueSum << ", got " << groupOutput << TestLog::EndMessage;
				return false;
			}

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const int	inputValue		= *(const int*)((const deUint8*)inputs + inputStride*(groupOffset+localNdx));
				const int	outputValue		= *(const int*)((const deUint8*)outputs + outputStride*(groupOffset+localNdx));

				if (!de::inRange(outputValue, (int)m_initialValue, valueSum-inputValue))
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": expected value in range [" << m_initialValue << ", " << (valueSum-inputValue)
														   << "], got " << outputValue
									   << TestLog::EndMessage;
					return false;
				}

				if (outValues.find(outputValue) != outValues.end())
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": found duplicate value " << outputValue
									   << TestLog::EndMessage;
					return false;
				}

				outValues.insert(outputValue);
				if (outputValue == valueSum-inputValue)
					maxFound = true;
			}

			if (!maxFound)
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: could not find maximum expected value from group " << groupNdx << TestLog::EndMessage;
				return false;
			}

			if (outValues.find((int)m_initialValue) == outValues.end())
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: could not find initial value from group " << groupNdx << TestLog::EndMessage;
				return false;
			}
		}

		return true;
	}
};


static int getPrecisionNumIntegerBits (glu::Precision precision)
{
	switch (precision)
	{
		case glu::PRECISION_HIGHP:		return 32;
		case glu::PRECISION_MEDIUMP:	return 16;
		case glu::PRECISION_LOWP:		return 9;
		default:
			DE_ASSERT(false);
			return 0;
	}
}

static deUint32 getPrecisionMask (int numPreciseBits)
{
	// \note: bit shift with larger or equal than var length is undefined, use 64 bit ints
	return (deUint32)((((deUint64)1u) << numPreciseBits) - 1) ;
}

static bool intEqualsAfterUintCast (deInt32 value, deUint32 casted, glu::Precision precision)
{
	// Bit format of 'casted' = [ uint -> highp uint promotion bits (0) ] [ sign extend bits (s) ] [ value bits ]
	//                                                                                             |--min len---|
	//                                                                    |---------------signed length---------|
	//                          |-------------------------------- highp uint length ----------------------------|

	const deUint32	reference		= (deUint32)value;
	const int		signBitOn		= value < 0;
	const int		numPreciseBits	= getPrecisionNumIntegerBits(precision);
	const deUint32	preciseMask		= getPrecisionMask(numPreciseBits);

	// Lowest N bits must match, N = minimum precision
	if ((reference & preciseMask) != (casted & preciseMask))
		return false;

	// Other lowest bits must match the sign and the remaining (topmost) if any must be 0
	for (int signedIntegerLength = numPreciseBits; signedIntegerLength <= 32; ++signedIntegerLength)
	{
		const deUint32 signBits = (signBitOn) ? (getPrecisionMask(signedIntegerLength)) : (0u);

		if ((signBits & ~preciseMask) == (casted & ~preciseMask))
			return true;
	}
	return false;
}

static bool containsAfterUintCast (const std::set<deInt32>& haystack, deUint32 needle, glu::Precision precision)
{
	for (std::set<deInt32>::const_iterator it = haystack.begin(); it != haystack.end(); ++it)
		if (intEqualsAfterUintCast(*it, needle, precision))
			return true;
	return false;
}

static bool containsAfterUintCast (const std::set<deUint32>& haystack, deInt32 needle, glu::Precision precision)
{
	for (std::set<deUint32>::const_iterator it = haystack.begin(); it != haystack.end(); ++it)
		if (intEqualsAfterUintCast(needle, *it, precision))
			return true;
	return false;
}

class ShaderAtomicMinCase : public ShaderAtomicOpCase
{
public:
	ShaderAtomicMinCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
		: ShaderAtomicOpCase(context, name, "atomicMin", operandType, type, precision, UVec3(3,2,1))
	{
		m_initialValue = m_precision == PRECISION_LOWP ? 100 : 1000;
	}

protected:
	void getInputs (int numValues, int stride, void* inputs) const
	{
		de::Random	rnd			(deStringHash(getName()));
		const bool	isSigned	= m_type == TYPE_INT;
		const int	maxVal		= m_precision == PRECISION_LOWP ? 100 : 1000;
		const int	minVal		= isSigned ? -maxVal : 0;

		for (int valNdx = 0; valNdx < numValues; valNdx++)
			*(int*)((deUint8*)inputs + stride*valNdx) = rnd.getInt(minVal, maxVal);
	}

	bool verify (int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const
	{
		const int	workGroupSize	= (int)product(m_workGroupSize);
		const int	numWorkGroups	= numValues/workGroupSize;
		bool		anyError		= false;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int		groupOffset		= groupNdx*workGroupSize;
			const deUint32	groupOutput		= *(const deUint32*)((const deUint8*)groupOutputs + groupNdx*groupStride);
			set<deInt32>	inValues;
			set<deUint32>	outValues;
			int				minValue		= (int)m_initialValue;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deInt32 inputValue = *(const deInt32*)((const deUint8*)inputs + inputStride*(groupOffset+localNdx));
				inValues.insert(inputValue);
				minValue = de::min(inputValue, minValue);
			}

			if (!intEqualsAfterUintCast(minValue, groupOutput, m_precision))
			{
				m_testCtx.getLog()
					<< TestLog::Message
					<< "ERROR: at group " << groupNdx
					<< ": expected minimum " << minValue << " (" << tcu::Format::Hex<8>((deUint32)minValue) << ")"
					<< ", got " << groupOutput << " (" << tcu::Format::Hex<8>(groupOutput) << ")"
					<< TestLog::EndMessage;
				anyError = true;
			}

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 outputValue = *(const deUint32*)((const deUint8*)outputs + outputStride*(groupOffset+localNdx));

				if (!containsAfterUintCast(inValues, outputValue, m_precision) &&
					!intEqualsAfterUintCast((deInt32)m_initialValue, outputValue, m_precision))
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": found unexpected value " << outputValue
														   << " (" << tcu::Format::Hex<8>(outputValue) << ")"
									   << TestLog::EndMessage;
					anyError = true;
				}

				outValues.insert(outputValue);
			}

			if (!containsAfterUintCast(outValues, (int)m_initialValue, m_precision))
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: could not find initial value from group " << groupNdx << TestLog::EndMessage;
				anyError = true;
			}
		}

		return !anyError;
	}
};

class ShaderAtomicMaxCase : public ShaderAtomicOpCase
{
public:
	ShaderAtomicMaxCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
		: ShaderAtomicOpCase(context, name, "atomicMax", operandType, type, precision, UVec3(3,2,1))
	{
		const bool isSigned = m_type == TYPE_INT;
		m_initialValue = isSigned ? (m_precision == PRECISION_LOWP ? -100 : -1000) : 0;
	}

protected:
	void getInputs (int numValues, int stride, void* inputs) const
	{
		de::Random	rnd			(deStringHash(getName()));
		const bool	isSigned	= m_type == TYPE_INT;
		const int	maxVal		= m_precision == PRECISION_LOWP ? 100 : 1000;
		const int	minVal		= isSigned ? -maxVal : 0;

		for (int valNdx = 0; valNdx < numValues; valNdx++)
			*(int*)((deUint8*)inputs + stride*valNdx) = rnd.getInt(minVal, maxVal);
	}

	bool verify (int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const
	{
		const int	workGroupSize	= (int)product(m_workGroupSize);
		const int	numWorkGroups	= numValues/workGroupSize;
		bool		anyError		= false;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int		groupOffset		= groupNdx*workGroupSize;
			const deUint32	groupOutput		= *(const deUint32*)((const deUint8*)groupOutputs + groupNdx*groupStride);
			set<int>		inValues;
			set<deUint32>	outValues;
			int				maxValue		= (int)m_initialValue;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deInt32 inputValue = *(const deInt32*)((const deUint8*)inputs + inputStride*(groupOffset+localNdx));
				inValues.insert(inputValue);
				maxValue = de::max(maxValue, inputValue);
			}

			if (!intEqualsAfterUintCast(maxValue, groupOutput, m_precision))
			{
				m_testCtx.getLog()
					<< TestLog::Message
					<< "ERROR: at group " << groupNdx
					<< ": expected maximum " << maxValue << " (" << tcu::Format::Hex<8>((deUint32)maxValue) << ")"
					<< ", got " << groupOutput << " (" << tcu::Format::Hex<8>(groupOutput) << ")"
					<< TestLog::EndMessage;
				anyError = true;
			}

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 outputValue = *(const deUint32*)((const deUint8*)outputs + outputStride*(groupOffset+localNdx));

				if (!containsAfterUintCast(inValues, outputValue, m_precision) &&
					!intEqualsAfterUintCast((deInt32)m_initialValue, outputValue, m_precision))
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": found unexpected value " << outputValue
														   << " (" << tcu::Format::Hex<8>(outputValue) << ")"
									   << TestLog::EndMessage;
					anyError = true;
				}

				outValues.insert(outputValue);
			}

			if (!containsAfterUintCast(outValues, (int)m_initialValue, m_precision))
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: could not find initial value from group " << groupNdx << TestLog::EndMessage;
				anyError = true;
			}
		}

		return !anyError;
	}
};

class ShaderAtomicAndCase : public ShaderAtomicOpCase
{
public:
	ShaderAtomicAndCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
		: ShaderAtomicOpCase(context, name, "atomicAnd", operandType, type, precision, UVec3(3,2,1))
	{
		const int		numBits		= m_precision == PRECISION_HIGHP ? 32 :
									  m_precision == PRECISION_MEDIUMP ? 16 : 8;
		const deUint32	valueMask	= numBits == 32 ? ~0u : (1u<<numBits)-1u;
		m_initialValue = ~((1u<<(numBits-1u)) | 1u) & valueMask; // All bits except lowest and highest set.
	}

protected:
	void getInputs (int numValues, int stride, void* inputs) const
	{
		de::Random		rnd				(deStringHash(getName()));
		const int		workGroupSize	= (int)product(m_workGroupSize);
		const int		numWorkGroups	= numValues/workGroupSize;
		const int		numBits			= m_precision == PRECISION_HIGHP ? 32 :
										  m_precision == PRECISION_MEDIUMP ? 16 : 8;
		const deUint32	valueMask		= numBits == 32 ? ~0u : (1u<<numBits)-1u;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int		groupOffset		= groupNdx*workGroupSize;
			const deUint32	groupMask		= 1<<rnd.getInt(0, numBits-2); // One bit is always set.

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
				*(deUint32*)((deUint8*)inputs + stride*(groupOffset+localNdx)) = (rnd.getUint32() & valueMask) | groupMask;
		}
	}

	bool verify (int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const
	{
		const int	workGroupSize	= (int)product(m_workGroupSize);
		const int	numWorkGroups	= numValues/workGroupSize;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int		groupOffset		= groupNdx*workGroupSize;
			const deUint32	groupOutput		= *(const deUint32*)((const deUint8*)groupOutputs + groupNdx*groupStride);
			deUint32		expectedValue	= m_initialValue;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 inputValue = *(const deUint32*)((const deUint8*)inputs + inputStride*(groupOffset+localNdx));
				expectedValue &= inputValue;
			}

			if (expectedValue != groupOutput)
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ": expected " << tcu::toHex(expectedValue) << ", got " << tcu::toHex(groupOutput) << TestLog::EndMessage;
				return false;
			}

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 outputValue = *(const deUint32*)((const deUint8*)outputs + outputStride*(groupOffset+localNdx));

				if ((outputValue & ~m_initialValue) != 0)
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": found unexpected value " << tcu::toHex(outputValue)
									   << TestLog::EndMessage;
					return false;
				}
			}
		}

		return true;
	}
};

class ShaderAtomicOrCase : public ShaderAtomicOpCase
{
public:
	ShaderAtomicOrCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
		: ShaderAtomicOpCase(context, name, "atomicOr", operandType, type, precision, UVec3(3,2,1))
	{
		m_initialValue = 1u; // Lowest bit set.
	}

protected:
	void getInputs (int numValues, int stride, void* inputs) const
	{
		de::Random		rnd				(deStringHash(getName()));
		const int		workGroupSize	= (int)product(m_workGroupSize);
		const int		numWorkGroups	= numValues/workGroupSize;
		const int		numBits			= m_precision == PRECISION_HIGHP ? 32 :
										  m_precision == PRECISION_MEDIUMP ? 16 : 8;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int groupOffset = groupNdx*workGroupSize;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
				*(deUint32*)((deUint8*)inputs + stride*(groupOffset+localNdx)) = 1u<<rnd.getInt(0, numBits-1);
		}
	}

	bool verify (int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const
	{
		const int	workGroupSize	= (int)product(m_workGroupSize);
		const int	numWorkGroups	= numValues/workGroupSize;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int		groupOffset		= groupNdx*workGroupSize;
			const deUint32	groupOutput		= *(const deUint32*)((const deUint8*)groupOutputs + groupNdx*groupStride);
			deUint32		expectedValue	= m_initialValue;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 inputValue = *(const deUint32*)((const deUint8*)inputs + inputStride*(groupOffset+localNdx));
				expectedValue |= inputValue;
			}

			if (expectedValue != groupOutput)
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ": expected " << tcu::toHex(expectedValue) << ", got " << tcu::toHex(groupOutput) << TestLog::EndMessage;
				return false;
			}

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 outputValue = *(const deUint32*)((const deUint8*)outputs + outputStride*(groupOffset+localNdx));

				if ((outputValue & m_initialValue) == 0)
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": found unexpected value " << tcu::toHex(outputValue)
									   << TestLog::EndMessage;
					return false;
				}
			}
		}

		return true;
	}
};

class ShaderAtomicXorCase : public ShaderAtomicOpCase
{
public:
	ShaderAtomicXorCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
		: ShaderAtomicOpCase(context, name, "atomicXor", operandType, type, precision, UVec3(3,2,1))
	{
		m_initialValue = 0;
	}

protected:
	void getInputs (int numValues, int stride, void* inputs) const
	{
		de::Random		rnd				(deStringHash(getName()));
		const int		workGroupSize	= (int)product(m_workGroupSize);
		const int		numWorkGroups	= numValues/workGroupSize;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int groupOffset = groupNdx*workGroupSize;

			// First uses random bit-pattern.
			*(deUint32*)((deUint8*)inputs + stride*(groupOffset)) = rnd.getUint32();

			// Rest have either all or no bits set.
			for (int localNdx = 1; localNdx < workGroupSize; localNdx++)
				*(deUint32*)((deUint8*)inputs + stride*(groupOffset+localNdx)) = rnd.getBool() ? ~0u : 0u;
		}
	}

	bool verify (int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const
	{
		const int		workGroupSize	= (int)product(m_workGroupSize);
		const int		numWorkGroups	= numValues/workGroupSize;
		const int		numBits			= m_precision == PRECISION_HIGHP ? 32 :
										  m_precision == PRECISION_MEDIUMP ? 16 : 8;
		const deUint32	compareMask		= numBits == 32 ? ~0u : (1u<<numBits)-1u;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int		groupOffset		= groupNdx*workGroupSize;
			const deUint32	groupOutput		= *(const deUint32*)((const deUint8*)groupOutputs + groupNdx*groupStride);
			const deUint32	randomValue		= *(const int*)((const deUint8*)inputs + inputStride*groupOffset);
			const deUint32	expected0		= randomValue ^ 0u;
			const deUint32	expected1		= randomValue ^ ~0u;
			int				numXorZeros		= (m_initialValue == 0) ? 1 : 0;

			for (int localNdx = 1; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 inputValue = *(const deUint32*)((const deUint8*)inputs + inputStride*(groupOffset+localNdx));
				if (inputValue == 0)
					numXorZeros += 1;
			}

			const deUint32 expected = (numXorZeros%2 == 0) ? expected0 : expected1;

			if ((groupOutput & compareMask) != (expected & compareMask))
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ": expected " << tcu::toHex(expected0)
													   << " or " << tcu::toHex(expected1) << " (compare mask " << tcu::toHex(compareMask)
													   << "), got " << tcu::toHex(groupOutput) << TestLog::EndMessage;
				return false;
			}

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const deUint32 outputValue = *(const deUint32*)((const deUint8*)outputs + outputStride*(groupOffset+localNdx));

				if ((outputValue&compareMask) != 0 &&
					(outputValue&compareMask) != compareMask &&
					(outputValue&compareMask) != (expected0&compareMask) &&
					(outputValue&compareMask) != (expected1&compareMask))
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": found unexpected value " << tcu::toHex(outputValue)
									   << TestLog::EndMessage;
					return false;
				}
			}
		}

		return true;
	}
};

class ShaderAtomicExchangeCase : public ShaderAtomicOpCase
{
public:
	ShaderAtomicExchangeCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
		: ShaderAtomicOpCase(context, name, "atomicExchange", operandType, type, precision, UVec3(3,2,1))
	{
		m_initialValue = 0;
	}

protected:
	void getInputs (int numValues, int stride, void* inputs) const
	{
		const int	workGroupSize	= (int)product(m_workGroupSize);
		const int	numWorkGroups	= numValues/workGroupSize;

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int groupOffset = groupNdx*workGroupSize;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
				*(int*)((deUint8*)inputs + stride*(groupOffset+localNdx)) = localNdx+1;
		}
	}

	bool verify (int numValues, int inputStride, const void* inputs, int outputStride, const void* outputs, int groupStride, const void* groupOutputs) const
	{
		const int	workGroupSize	= (int)product(m_workGroupSize);
		const int	numWorkGroups	= numValues/workGroupSize;

		DE_UNREF(inputStride && inputs);

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int	groupOffset		= groupNdx*workGroupSize;
			const int	groupOutput		= *(const int*)((const deUint8*)groupOutputs + groupNdx*groupStride);
			set<int>	usedValues;

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const int outputValue = *(const int*)((const deUint8*)outputs + outputStride*(groupOffset+localNdx));

				if (!de::inRange(outputValue, 0, workGroupSize) || usedValues.find(outputValue) != usedValues.end())
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": found unexpected value " << outputValue
									   << TestLog::EndMessage;
					return false;
				}
				usedValues.insert(outputValue);
			}

			if (!de::inRange(groupOutput, 0, workGroupSize) || usedValues.find(groupOutput) != usedValues.end())
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ": unexpected final value" << groupOutput << TestLog::EndMessage;
				return false;
			}
		}

		return true;
	}
};

class ShaderAtomicCompSwapCase : public TestCase
{
public:
									ShaderAtomicCompSwapCase	(Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision);
									~ShaderAtomicCompSwapCase	(void);

	void							init						(void);
	void							deinit						(void);
	IterateResult					iterate						(void);

protected:

private:
									ShaderAtomicCompSwapCase	(const ShaderAtomicCompSwapCase& other);
	ShaderAtomicCompSwapCase&		operator=					(const ShaderAtomicCompSwapCase& other);

	const AtomicOperandType			m_operandType;
	const DataType					m_type;
	const Precision					m_precision;

	const UVec3						m_workGroupSize;
	const UVec3						m_numWorkGroups;

	ShaderProgram*					m_program;
};

ShaderAtomicCompSwapCase::ShaderAtomicCompSwapCase (Context& context, const char* name, AtomicOperandType operandType, DataType type, Precision precision)
	: TestCase			(context, name, "atomicCompSwap() Test")
	, m_operandType		(operandType)
	, m_type			(type)
	, m_precision		(precision)
	, m_workGroupSize	(3,2,1)
	, m_numWorkGroups	(4,4,4)
	, m_program			(DE_NULL)
{
}

ShaderAtomicCompSwapCase::~ShaderAtomicCompSwapCase (void)
{
	ShaderAtomicCompSwapCase::deinit();
}

void ShaderAtomicCompSwapCase::init (void)
{
	const bool			isSSBO		= m_operandType == ATOMIC_OPERAND_BUFFER_VARIABLE;
	const char*			precName	= getPrecisionName(m_precision);
	const char*			typeName	= getDataTypeName(m_type);
	const deUint32		numValues	= product(m_workGroupSize)*product(m_numWorkGroups);
	std::ostringstream	src;

	src << "#version 310 es\n"
		<< "layout(local_size_x = " << m_workGroupSize.x()
		<< ", local_size_y = " << m_workGroupSize.y()
		<< ", local_size_z = " << m_workGroupSize.z() << ") in;\n"
		<< "layout(binding = 0) buffer InOut\n"
		<< "{\n"
		<< "	" << precName << " " << typeName << " compareValues[" << numValues << "];\n"
		<< "	" << precName << " " << typeName << " exchangeValues[" << numValues << "];\n"
		<< "	" << precName << " " << typeName << " outputValues[" << numValues << "];\n"
		<< "	" << (isSSBO ? "coherent " : "") << precName << " " << typeName << " groupValues[" << product(m_numWorkGroups) << "];\n"
		<< "} sb_inout;\n";

	if (!isSSBO)
		src << "shared " << precName << " " << typeName << " s_var;\n";

	src << "\n"
		<< "void main (void)\n"
		<< "{\n"
		<< "	uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "	uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "	uint globalOffs = localSize*globalNdx;\n"
		<< "	uint offset     = globalOffs + gl_LocalInvocationIndex;\n"
		<< "\n";

	if (!isSSBO)
	{
		src << "	if (gl_LocalInvocationIndex == 0u)\n"
			<< "		s_var = " << typeName << "(" << 0 << ");\n"
			<< "\n";
	}

	src << "	" << precName << " " << typeName << " compare = sb_inout.compareValues[offset];\n"
		<< "	" << precName << " " << typeName << " exchange = sb_inout.exchangeValues[offset];\n"
		<< "	" << precName << " " << typeName << " result;\n"
		<< "	bool swapDone = false;\n"
		<< "\n"
		<< "	for (uint ndx = 0u; ndx < localSize; ndx++)\n"
		<< "	{\n"
		<< "		barrier();\n"
		<< "		if (!swapDone)\n"
		<< "		{\n"
		<< "			result = atomicCompSwap(" << (isSSBO ? "sb_inout.groupValues[globalNdx]" : "s_var") << ", compare, exchange);\n"
		<< "			if (result == compare)\n"
		<< "				swapDone = true;\n"
		<< "		}\n"
		<< "	}\n"
		<< "\n"
		<< "	sb_inout.outputValues[offset] = result;\n";

	if (!isSSBO)
	{
		src << "	barrier();\n"
			<< "	if (gl_LocalInvocationIndex == 0u)\n"
			<< "		sb_inout.groupValues[globalNdx] = s_var;\n";
	}

	src << "}\n";

	DE_ASSERT(!m_program);
	m_program = new ShaderProgram(m_context.getRenderContext(), ProgramSources() << ComputeSource(src.str()));

	m_testCtx.getLog() << *m_program;

	if (!m_program->isOk())
	{
		delete m_program;
		m_program = DE_NULL;
		throw tcu::TestError("Compile failed");
	}
}

void ShaderAtomicCompSwapCase::deinit (void)
{
	delete m_program;
	m_program = DE_NULL;
}

ShaderAtomicOpCase::IterateResult ShaderAtomicCompSwapCase::iterate (void)
{
	const glw::Functions&		gl				= m_context.getRenderContext().getFunctions();
	const deUint32				program			= m_program->getProgram();
	const Buffer				inoutBuffer		(m_context.getRenderContext());
	const deUint32				blockNdx		= gl.getProgramResourceIndex(program, GL_SHADER_STORAGE_BLOCK, "InOut");
	const InterfaceBlockInfo	blockInfo		= getProgramInterfaceBlockInfo(gl, program, GL_SHADER_STORAGE_BLOCK, blockNdx);
	const deUint32				cmpVarNdx		= gl.getProgramResourceIndex(program, GL_BUFFER_VARIABLE, "InOut.compareValues[0]");
	const InterfaceVariableInfo	cmpVarInfo		= getProgramInterfaceVariableInfo(gl, program, GL_BUFFER_VARIABLE, cmpVarNdx);
	const deUint32				exhVarNdx		= gl.getProgramResourceIndex(program, GL_BUFFER_VARIABLE, "InOut.exchangeValues[0]");
	const InterfaceVariableInfo	exhVarInfo		= getProgramInterfaceVariableInfo(gl, program, GL_BUFFER_VARIABLE, exhVarNdx);
	const deUint32				outVarNdx		= gl.getProgramResourceIndex(program, GL_BUFFER_VARIABLE, "InOut.outputValues[0]");
	const InterfaceVariableInfo	outVarInfo		= getProgramInterfaceVariableInfo(gl, program, GL_BUFFER_VARIABLE, outVarNdx);
	const deUint32				groupVarNdx		= gl.getProgramResourceIndex(program, GL_BUFFER_VARIABLE, "InOut.groupValues[0]");
	const InterfaceVariableInfo	groupVarInfo	= getProgramInterfaceVariableInfo(gl, program, GL_BUFFER_VARIABLE, groupVarNdx);
	const deUint32				numValues		= product(m_workGroupSize)*product(m_numWorkGroups);

	TCU_CHECK(cmpVarInfo.arraySize == numValues &&
			  exhVarInfo.arraySize == numValues &&
			  outVarInfo.arraySize == numValues &&
			  groupVarInfo.arraySize == product(m_numWorkGroups));

	gl.useProgram(program);

	// \todo [2013-09-05 pyry] Use randomized input values!

	// Setup buffer.
	{
		const deUint32	workGroupSize	= product(m_workGroupSize);
		vector<deUint8>	bufData			(blockInfo.dataSize);

		std::fill(bufData.begin(), bufData.end(), 0);

		for (deUint32 ndx = 0; ndx < numValues; ndx++)
			*(deUint32*)(&bufData[0] + cmpVarInfo.offset + cmpVarInfo.arrayStride*ndx) = ndx%workGroupSize;

		for (deUint32 ndx = 0; ndx < numValues; ndx++)
			*(deUint32*)(&bufData[0] + exhVarInfo.offset + exhVarInfo.arrayStride*ndx) = (ndx%workGroupSize)+1;

		gl.bindBuffer(GL_SHADER_STORAGE_BUFFER, *inoutBuffer);
		gl.bufferData(GL_SHADER_STORAGE_BUFFER, blockInfo.dataSize, &bufData[0], GL_STATIC_READ);
		gl.bindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, *inoutBuffer);
		GLU_EXPECT_NO_ERROR(gl.getError(), "Output buffer setup failed");
	}

	gl.dispatchCompute(m_numWorkGroups.x(), m_numWorkGroups.y(), m_numWorkGroups.z());

	// Read back and compare
	{
		const void*		resPtr			= gl.mapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, blockInfo.dataSize, GL_MAP_READ_BIT);
		const int		numWorkGroups	= (int)product(m_numWorkGroups);
		const int		workGroupSize	= (int)product(m_workGroupSize);
		bool			isOk			= true;

		GLU_EXPECT_NO_ERROR(gl.getError(), "glMapBufferRange()");
		TCU_CHECK(resPtr);

		for (int groupNdx = 0; groupNdx < numWorkGroups; groupNdx++)
		{
			const int	groupOffset		= groupNdx*workGroupSize;
			const int	groupOutput		= *(const int*)((const deUint8*)resPtr + groupVarInfo.offset + groupNdx*groupVarInfo.arrayStride);

			for (int localNdx = 0; localNdx < workGroupSize; localNdx++)
			{
				const int	refValue		= localNdx;
				const int	outputValue		= *(const int*)((const deUint8*)resPtr + outVarInfo.offset + outVarInfo.arrayStride*(groupOffset+localNdx));

				if (outputValue != refValue)
				{
					m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ", invocation " << localNdx
														   << ": expected " << refValue << ", got " << outputValue
									   << TestLog::EndMessage;
					isOk = false;
					break;
				}
			}

			if (groupOutput != workGroupSize)
			{
				m_testCtx.getLog() << TestLog::Message << "ERROR: at group " << groupNdx << ": expected" << workGroupSize << ", got " << groupOutput << TestLog::EndMessage;
				isOk = false;
				break;
			}
		}

		gl.unmapBuffer(GL_SHADER_STORAGE_BUFFER);
		GLU_EXPECT_NO_ERROR(gl.getError(), "glUnmapBuffer()");

		m_testCtx.setTestResult(isOk ? QP_TEST_RESULT_PASS	: QP_TEST_RESULT_FAIL,
								isOk ? "Pass"				: "Comparison failed");
	}

	return STOP;
}

ShaderAtomicOpTests::ShaderAtomicOpTests (Context& context, const char* name, AtomicOperandType operandType)
	: TestCaseGroup	(context, name, "Atomic Operation Tests")
	, m_operandType	(operandType)
{
}

ShaderAtomicOpTests::~ShaderAtomicOpTests (void)
{
}

template<typename T>
static tcu::TestCaseGroup* createAtomicOpGroup (Context& context, AtomicOperandType operandType, const char* groupName)
{
	tcu::TestCaseGroup *const group = new tcu::TestCaseGroup(context.getTestContext(), groupName, (string("Atomic ") + groupName).c_str());
	try
	{
		for (int precNdx = 0; precNdx < PRECISION_LAST; precNdx++)
		{
			for (int typeNdx = 0; typeNdx < 2; typeNdx++)
			{
				const Precision		precision		= Precision(precNdx);
				const DataType		type			= typeNdx > 0 ? TYPE_INT : TYPE_UINT;
				const string		caseName		= string(getPrecisionName(precision)) + "_" + getDataTypeName(type);

				group->addChild(new T(context, caseName.c_str(), operandType, type, precision));
			}
		}

		return group;
	}
	catch (...)
	{
		delete group;
		throw;
	}
}

void ShaderAtomicOpTests::init (void)
{
	addChild(createAtomicOpGroup<ShaderAtomicAddCase>		(m_context, m_operandType, "add"));
	addChild(createAtomicOpGroup<ShaderAtomicMinCase>		(m_context, m_operandType, "min"));
	addChild(createAtomicOpGroup<ShaderAtomicMaxCase>		(m_context, m_operandType, "max"));
	addChild(createAtomicOpGroup<ShaderAtomicAndCase>		(m_context, m_operandType, "and"));
	addChild(createAtomicOpGroup<ShaderAtomicOrCase>		(m_context, m_operandType, "or"));
	addChild(createAtomicOpGroup<ShaderAtomicXorCase>		(m_context, m_operandType, "xor"));
	addChild(createAtomicOpGroup<ShaderAtomicExchangeCase>	(m_context, m_operandType, "exchange"));
	addChild(createAtomicOpGroup<ShaderAtomicCompSwapCase>	(m_context, m_operandType, "compswap"));
}

} // Functional
} // gles31
} // deqp
