diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/.gitrepo b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/.gitrepo new file mode 100644 index 0000000..7966bf8 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = https://github.com/V-Sekai/preconditioner-for-cloth-and-deformable-body-simulation.git + branch = master + commit = bf3d095a8636fa541990e38c1fce796515b7b119 + parent = c83e46325ad184347f259ac9ce3c43388d32f6e9 + method = merge + cmdver = 0.4.6 diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/LICENSE b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/LICENSE new file mode 100644 index 0000000..4d15781 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/LICENSE @@ -0,0 +1,24 @@ +Copyright (C) 2002 - 2022, +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/README.md b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/README.md new file mode 100644 index 0000000..572300b --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/README.md @@ -0,0 +1,7 @@ +# preconditioner-for-cloth-and-deformable-body-simulation + +This is a mirror of https://wanghmin.github.io/publications.html + +## Entry point + +See `SeSchwarzPreconditioner.h` for entry points. diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeAabb.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeAabb.h new file mode 100644 index 0000000..ba41f4f --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeAabb.h @@ -0,0 +1,185 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "SeMath.h" +#include + +SE_NAMESPACE_BEGIN + +/************************************************************************* +****************************** SeAabb ****************************** +*************************************************************************/ + +/** + * @brief Axis-align Bounding Box. + */ +template struct SeAabb +{ + static_assert(std::is_floating_point::value, "Only support floating type currently."); + +public: + + //! @brief Default constructor. + SeAabb() : Lower(FLT_MAX), Upper(-FLT_MAX) {} + + //! @brief Constructed by a given point. + SeAabb(const VecType & Point) : Lower(Point), Upper(Point) {} + + //! 
@brief Constructed by a paired points explicitly. + explicit SeAabb(const VecType & P1, const VecType & P2) : Lower(Math::Min(P1, P2)), Upper(Math::Max(P1, P2)) {} + + //! @brief Constructed by a paired points explicitly. + explicit SeAabb(const VecType & P1, const VecType & P2, const VecType & P3) : Lower(Math::Min(P1, P2, P3)), Upper(Math::Max(P1, P2, P3)) {} + + + + +public: + + SeAabb operator+(const SeAabb & Other) const + { + return SeAabb(Math::Min(Lower, Other.Lower), Math::Max(Upper, Other.Upper)); + } + + SeAabb operator+(const VecType & Point) const + { + return SeAabb(Math::Min(Lower, Point), Math::Max(Upper, Point)); + } + + void operator+=(const SeAabb & Other) + { + Lower = Math::Min(Lower, Other.Lower); Upper = Math::Max(Upper, Other.Upper); + } + + void operator+=(const VecType & Point) + { + Lower = Math::Min(Lower, Point); Upper = Math::Max(Upper, Point); + } + + void Enlarge(float Amount) + { + Lower -= Amount; Upper += Amount; + } + + VecType Center() const + { + return Math::Average(Lower, Upper); + } + + VecType Extent() const + { + return Upper - Lower; + } + +public: + + VecType Lower, Upper; +}; + + +/************************************************************************* +*********************** IsInside ************************ +*************************************************************************/ + +template +static SE_INLINE bool IsContain(const SeAabb& aabb, const VecT& P) +{ + const int componentCount = VecT::Elements; + for (int i = 0; i < componentCount; i++) + { + if (P[i] < aabb.Lower[i] || P[i] > aabb.Upper[i]) + { + return false; + } + } + return true; + +} +template +static SE_INLINE bool IsContain(const SeAabb& aabb, const VecT& P,float radium) +{ + auto enlargedAabb = aabb; + enlargedAabb.Upper += radium; + enlargedAabb.Lower -= radium; + return IsContain(enlargedAabb, P); +} + +template<> +static SE_INLINE bool IsContain(const SeAabb& aabb, const Float2& P) +{ + return !((aabb.Upper.x < P.x) || (aabb.Upper.y < P.y) || + (aabb.Lower.x > P.x)
|| (aabb.Lower.y > P.y)); +} + +template<> +static SE_INLINE bool IsContain(const SeAabb& aabb, const Float3& P) +{ + return !((aabb.Upper.x < P.x) || (aabb.Upper.y < P.y) || (aabb.Upper.z < P.z) || + (aabb.Lower.x > P.x) || (aabb.Lower.y > P.y) || (aabb.Lower.z > P.z)); +} + +/************************************************************************* +*********************** IsInside ************************ +*************************************************************************/ + +template +static bool IsIntersect(const SeAabb& aabb, const VecT& Pa, const VecT& Pb, const VecT& iv); + +template +static bool IsIntersect(const SeAabb& aabb, const VecT& Pa, const VecT& Pb); + + +template<> +static SE_INLINE bool IsIntersect(const SeAabb& aabb, const Float3& Pa, const Float3& Pb) +{ + Float3 Dir = Float3(Pb - Pa); + + if (Dir.x == 0.0f) Dir.x = 1e-6f; + if (Dir.y == 0.0f) Dir.y = 1e-6f; + if (Dir.z == 0.0f) Dir.z = 1e-6f; + + Float3 invDir = 1.0f / Dir; + Float3 Left = Float3(aabb.Lower - Pa) * invDir; + Float3 Right = Float3(aabb.Upper - Pa) * invDir; + + Float2 T1 = Float2(Math::Min(Left.x, Right.x), Math::Max(Left.x, Right.x)); + Float2 T2 = Float2(Math::Min(Left.y, Right.y), Math::Max(Left.y, Right.y)); + Float2 T3 = Float2(Math::Min(Left.z, Right.z), Math::Max(Left.z, Right.z)); + + float range0 = Math::Max(T1.x, T2.x, T3.x, 0.0f); + float range1 = Math::Min(T1.y, T2.y, T3.y, 1.0f); + + return range0 < range1; +} + + +template +static SE_INLINE bool IsOverlap(const SeAabb& aabb, const SeAabb& P) { return false; } + + +SE_NAMESPACE_END diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeAabbSimd.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeAabbSimd.h new file mode 100644 index 0000000..890da40 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeAabbSimd.h @@ -0,0 +1,148 @@ +///////////////////////////////////////////////////////////////////////////////////////////// 
+// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "SeMathSimd.h" + +#include "SeAabb.h" + +#include + +SE_NAMESPACE_BEGIN + +/************************************************************************* +****************************** SeAabb ****************************** +*************************************************************************/ + +/** + * @brief Axis-align Bounding Box. + */ + +template<> struct SeAabb +{ + +public: + + //! @brief Default constructor. 
+ SeAabb() : Lower(FLT_MAX), Upper(-FLT_MAX) {} + + //! @brief Constructed by a given point. + SeAabb(const Simd3f & Point) : Lower(Point), Upper(Point) {} + + //! @brief Constructed by a paired points explicitly. + explicit SeAabb(const Simd3f & P1, const Simd3f & P2) : Lower(Math::Min(P1, P2)), Upper(Math::Max(P1, P2)) {} + +public: + + SeAabb operator+(const SeAabb & Other) const + { + return SeAabb(Math::Min(Lower, Other.Lower), Math::Max(Upper, Other.Upper)); + } + + SeAabb operator+(const Simd3f & Point) const + { + return SeAabb(Math::Min(Lower, Point), Math::Max(Upper, Point)); + } + + void operator += (const SeAabb & Other) + { + Lower = Math::Min(Lower, Other.Lower); Upper = Math::Max(Upper, Other.Upper); + } + + void operator += (const Simd3f & Point) + { + Lower = Math::Min(Lower, Point); Upper = Math::Max(Upper, Point); + } + + void Enlarge(float Amount) + { + Lower -= Amount; Upper += Amount; + } + + Simd3f Center() const + { + return Math::Average(Lower, Upper); + } + + Simd3f Extent() const + { + return Upper - Lower; + } + +public: + + Simd3f Lower, Upper; +}; + + + +template<> +static SE_INLINE bool IsContain(const SeAabb& aabb, const Simd3f& P) +{ + return !((aabb.Upper.x < P.x) || (aabb.Upper.y < P.y) || (aabb.Upper.z < P.z) || + (aabb.Lower.x > P.x) || (aabb.Lower.y > P.y) || (aabb.Lower.z > P.z)); +} + +template<> +static SE_INLINE bool IsContain(const SeAabb& aabb, const Simd3f& P, float radium) +{ + return !((aabb.Upper.x < P.x - radium) || (aabb.Upper.y < P.y - radium) || (aabb.Upper.z < P.z - radium) || + (aabb.Lower.x > P.x + radium) || (aabb.Lower.y > P.y + radium) || (aabb.Lower.z > P.z + radium)); +} + +template<> +static SE_INLINE bool IsOverlap(const SeAabb& aabb, const SeAabb& P) +{ + return !((aabb.Upper.x < P.Lower.x) || (aabb.Upper.y < P.Lower.y) || (aabb.Upper.z < P.Lower.z) || + (aabb.Lower.x > P.Upper.x) || (aabb.Lower.y > P.Upper.y) || (aabb.Lower.z > P.Upper.z)); +} + + +template<> +static SE_INLINE bool IsIntersect(const 
SeAabb& aabb, const Simd3f& Pa, const Simd3f& Pb) +{ + Simd3f Dir = Simd3f(Pb - Pa); + + if (Dir.x == 0.0f) Dir.x = 1e-6f; + if (Dir.y == 0.0f) Dir.y = 1e-6f; + if (Dir.z == 0.0f) Dir.z = 1e-6f; + + Simd3f invDir = 1.0f / Dir; + Simd3f Left = Simd3f(aabb.Lower - Pa) * invDir; + Simd3f Right = Simd3f(aabb.Upper - Pa) * invDir; + + Float2 T1 = Float2(Math::Min(Left.x, Right.x), Math::Max(Left.x, Right.x)); + Float2 T2 = Float2(Math::Min(Left.y, Right.y), Math::Max(Left.y, Right.y)); + Float2 T3 = Float2(Math::Min(Left.z, Right.z), Math::Max(Left.z, Right.z)); + + float range0 = Math::Max(T1.x, T2.x, T3.x, 0.0f); + float range1 = Math::Min(T1.y, T2.y, T3.y, 1.0f); + + return range0 < range1; +} + +SE_NAMESPACE_END diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeArray2D.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeArray2D.h new file mode 100644 index 0000000..d160c92 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeArray2D.h @@ -0,0 +1,127 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "SePreDefine.h" + +SE_NAMESPACE_BEGIN + +/************************************************************************* +**************************** SeArray2D ***************************** +*************************************************************************/ + +//! @brief 二维数组容器 +template class SeArray2D +{ + +public: + + //! @brief Default constructor. + SeArray2D() : m_Rows(0), m_Columns(0) {} + + //! @brief Construct and resize. + explicit SeArray2D(size_t _Rows, size_t _Columns) : m_Rows(_Rows), m_Columns(_Columns), m_Values(_Rows * _Columns) {} + + //! @brief Construct, resize and memory set. + explicit SeArray2D(size_t _Rows, size_t _Columns, Type _Value) : m_Rows(_Rows), m_Columns(_Columns), m_Values(_Rows * _Columns, _Value) {} + +public: + + //! @brief Data will be reserved in Column Major format + void Resize(size_t _Rows, size_t _Columns) + { + m_Values.resize(_Rows * _Columns); + + m_Columns = _Columns; + + m_Rows = _Rows; + } + + //! @brief release spare memory + void ShrinkToFit() + { + m_Values.shrink_to_fit(); + } + + //! @brief Fill data. 
+ void Memset(Type _Value, size_t _Begin = 0, size_t _End = SIZE_MAX) + { + _End = _End < m_Values.size() ? _End : m_Values.size(); + + for (size_t i = _Begin; i < _End; ++i) + { + m_Values[i] = _Value; + } + } + + //! @brief Exchange context with right. + void Swap(SeArray2D & _Right) + { + m_Values.swap(_Right.m_Values); + + SE_SWAP(m_Columns, _Right.m_Columns); + + SE_SWAP(m_Rows, _Right.m_Rows); + } + + void Clear() + { + m_Values.clear(); + + m_Values.shrink_to_fit(); + + m_Rows = m_Columns = 0; + } + +public: + + const Type * operator[](size_t i) const { SE_ASSERT(i < m_Rows); return &m_Values[m_Columns * i]; } + + Type * operator[](size_t i) { SE_ASSERT(i < m_Rows); return &m_Values[m_Columns * i]; } + + const Type * Ptr() const { return m_Values.data(); } + + bool IsEmpty() const { return m_Values.empty(); } + + size_t Size() const { return m_Values.size(); } + + size_t Capacity() const { return m_Values.capacity(); } + + size_t Columns() const { return m_Columns; } + + size_t Rows() const { return m_Rows; } + + Type * Ptr() { return m_Values.data(); } + +private: + + std::vector m_Values; + + size_t m_Rows, m_Columns; +}; + +SE_NAMESPACE_END \ No newline at end of file diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeCollisionElements.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeCollisionElements.h new file mode 100644 index 0000000..e5c6688 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeCollisionElements.h @@ -0,0 +1,71 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include "SeVectorSimd.h" + +SE_NAMESPACE_BEGIN + +struct EfSet +{ + int m_eId; + int m_fId; + float stiff; + Float3 m_bary; // (x, 1-x) / (y, z, 1-y-z): barycentric weight of the intersection point + SeVec3fSimd m_normal; // repulsion direction +}; + +struct VfSet +{ + int m_vId; + int m_fId; + float stiff; + Float2 m_bary; // (x, y, 1-x-y): barycentric weight of the vertex + SeVec3fSimd m_normal; // repulsion direction +}; + +struct EeSet +{ + int m_eId0; + int m_eId1; + float stiff; + Float2 m_bary; // (x, 1-x) / (y, 1-y): barycentric weight of the two closest points + SeVec3fSimd m_normal; // repulsion direction +}; + +struct Stencil +{ + int verextNumPerStencil; + int vertexNumOfFirstPrimitive; + + int index[5]; + float weight[5]; + float stiff; + SeVec3fSimd direction; +}; + +SE_NAMESPACE_END \ No newline at end of file diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeCsr.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeCsr.h new file mode 100644 index 0000000..6149715 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeCsr.h @@ -0,0 +1,188 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. 
The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "SeOmp.h" +#include "SeArray2D.h" +#include "SeUtility.h" + +SE_NAMESPACE_BEGIN + +template class SeCompressSparseData +{ +public: + + SeCompressSparseData() {} + SeCompressSparseData(const std::vector& starts, const std::vector& idxs, const std::vector& values) + : + m_starts(starts), + m_idxs(idxs), + m_values(values) + {} + + virtual ~SeCompressSparseData() {} + + void InitIdxs(const std::vector> & array2D) + { + Clear(); + + int size = array2D.size(); + + m_starts.resize(size + 1); Utility::MemsetZero(m_starts); + + for (int i = 0; i < size; ++i) + { + int num = array2D[i].size(); + + m_starts[i + 1] = m_starts[i] + num; + } + + m_idxs.resize(m_starts.back()); + + OMP_PARALLEL_FOR + + for (int i = 0; i < size; ++i) + { + int num = array2D[i].size(); + std::memcpy(m_idxs.data() + m_starts[i], array2D[i].data(), sizeof(int) * num); + } + } + + void InitIdxs(const SeArray2D & array2D, bool isRowMajor) + { + Clear(); + + int DIM = isRowMajor ? 
array2D.Rows() : array2D.Columns(); + + m_starts.resize(DIM + 1); Utility::MemsetZero(m_starts); + + for (int i = 0; i < DIM; ++i) + { + int num = isRowMajor ? array2D[i][0] : array2D[0][i]; + + m_starts[i + 1] = m_starts[i] + num; + } + + m_idxs.resize(m_starts.back()); + + OMP_PARALLEL_FOR + + for (int i = 0; i < DIM; ++i) + { + int * address = m_idxs.data() + m_starts[i]; + + int num = Size(i); + + for (int k = 1; k <= num; ++k) + { + address[k - 1] = isRowMajor ? array2D[i][k] : array2D[k][i]; + } + } + } + + void Clear() + { + Utility::ClearAndShrink(m_starts); + Utility::ClearAndShrink(m_idxs); + Utility::ClearAndShrink(m_values); + } + + int Size() const + { + return m_starts.back(); + } + + int Size(int id) const + { + return -m_starts[id] + m_starts[id + 1]; + } + + int Start(int id) const + { + return m_starts[id]; + } + + const int* StartPtr(int id) const + { + return &(m_starts[id]); + } + + const int Idx(int id) const + { + return m_idxs[id]; + } + + const int * IdxPtr(int id) const + { + return m_idxs.data() + m_starts[id]; + } + + const Type * ValuePtr(int id) const + { + return m_values.data() + m_starts[id]; + } + + virtual const SeCompressSparseData * Ptr() const + { + return this; + } + +protected: + + std::vector m_starts; + std::vector m_idxs; + std::vector m_values; +}; + +template class SeCsr : public SeCompressSparseData +{ +public: + SeCsr() {} + SeCsr(const std::vector& starts, const std::vector& idxs, const std::vector& values) :SeCompressSparseData(starts, idxs, values) {} + + int Rows() const { return SeCompressSparseData::m_starts.size() - 1; } + + virtual const SeCsr * Ptr() const override + { + return this; + } +}; + + +template class SeCsc : public SeCompressSparseData +{ +public: + + int Columns() const { return SeCompressSparseData::m_starts.size() - 1; } + + virtual const SeCsc * Ptr() const override + { + return this; + } +}; + +SE_NAMESPACE_END \ No newline at end of file diff --git 
a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeIntrinsic.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeIntrinsic.h new file mode 100644 index 0000000..370b9d3 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeIntrinsic.h @@ -0,0 +1,157 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include +#include +#include + +#include "SeMatrix.h" + +SE_NAMESPACE_BEGIN + +namespace Intrinsic +{ + /************************************************************************* + ************************** Count of bits *************************** + *************************************************************************/ + +#ifdef WIN32 + inline int Clz32(unsigned int Val) { return __lzcnt(Val); } + inline int Clz64(unsigned __int64 Val) { return __lzcnt64(Val); } + inline int Popcount32(unsigned int Val) { return __popcnt(Val); } + inline int Popcount64(unsigned __int64 Val) { return __popcnt64(Val); } + + #pragma intrinsic(_BitScanForward) + inline int Ffs(int s) + { + unsigned long index = -1; + unsigned char isNonzero = _BitScanForward(&index, s); + /*s should be greater than 0; index is zero-based*/ + return isNonzero ? (index + 1) : 0; + } + +#else + inline int Clz32(unsigned int Val) { return std::countl_zero(Val); } + inline int Clz64( uint64_t Val) { return std::countl_zero(Val); } + inline int Popcount32(unsigned int Val) { return std::popcount(Val); } + inline int Popcount64(uint64_t Val) { return std::popcount(Val); } + inline int Ffs(int s) { return ffs(s); } +#endif + + /************************************************************************* + ************************ Atomic operations ************************* + *************************************************************************/ + + + template inline Type AtomicOr(Type * Address, Type Val) { return reinterpret_cast*>(Address)->fetch_or(Val,std::memory_order_relaxed); } + template inline Type AtomicXor(Type * Address, Type Val) { return reinterpret_cast*>(Address)->fetch_xor(Val,std::memory_order_relaxed); } + template inline Type AtomicAnd(Type * Address, Type Val) { return reinterpret_cast*>(Address)->fetch_and(Val,std::memory_order_relaxed); } + template inline Type AtomicAdd(Type * Address, Type Val) { return 
reinterpret_cast*>(Address)->fetch_add(Val,std::memory_order_relaxed); } + template inline Type AtomicSub(Type * Address, Type Val) { return reinterpret_cast*>(Address)->fetch_sub(Val,std::memory_order_relaxed); } + template inline Type AtomicExch(Type * Address, Type Val) { return reinterpret_cast*>(Address)->exchange(Val,std::memory_order_relaxed); } + template inline Type AtomicCAS(Type* Address, Type Exp, Type Val) { reinterpret_cast*>(Address)->compare_exchange_strong(Exp, Val, std::memory_order_relaxed); return Exp; } + + template< typename Type, typename Op > Type AtomicOperationByCas(Type* addr, Type value, Op op) + { + Type oldMyValue = *addr; + bool isChangedByOtherThread = true; + while (isChangedByOtherThread) + { + oldMyValue = *addr; + Type m = op(value, oldMyValue); + Type newMyValue = AtomicCAS(addr, oldMyValue, m); + isChangedByOtherThread = oldMyValue != newMyValue; + } + return oldMyValue; + } + + + template inline Type AtomicMax(Type* addr, Type value) + { + return AtomicOperationByCas(addr, value, [](const auto& a, const auto& b) { return a > b ? a : b; }); + } + + template inline Type AtomicMin(Type* addr, Type value) + { + return AtomicOperationByCas(addr, value, [](const auto& a, const auto& b) { return a < b ? 
a : b; }); + } + +#if defined(WIN32) && (_MSVC_LANG > 201703L) // only for C++ 20 on Windows + inline void AtomicAdd(SeMatrix3f* address, SeMatrix3f value) + { + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + AtomicAdd(&(*address)(i, j), value(i, j)); + } + } + } + + template inline void AtomicAdd(SeVector * address, SeVector value) + { + for (int i = 0; i < SeVector::Elements; i++) + { + AtomicAdd(&(*address)[i], value[i]); + } + } +#else + inline void AtomicAdd(SeMatrix3f* address, SeMatrix3f value) + { + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + AtomicOperationByCas( &(*address)(i, j), value(i, j), [](const auto& a, const auto& b) { return a + b; }); + } + } + } + + template inline void AtomicAdd(SeVector * address, SeVector value) + { + for (int i = 0; i < SeVector::Elements; i++) + { + AtomicOperationByCas(&(*address)[i], value[i], [](const auto& a, const auto& b) { return a + b; }); + } + } + +#endif + + /************************************************************************* + ************************* Warp operations ************************** + *************************************************************************/ + + + inline void SyncWarp() {} + inline void SyncBlock() {} + inline unsigned int ActiveMask(); + inline unsigned int LanemaskLt(); + inline unsigned int LanemaskLt(unsigned int laneId) { return (1U << laneId) - 1; } +} + +SE_NAMESPACE_END \ No newline at end of file diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMath.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMath.h new file mode 100644 index 0000000..2fab11f --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMath.h @@ -0,0 +1,187 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include "SeVector.h" +#include "SeMatrix.h" + +SE_NAMESPACE_BEGIN + +#define SE_E 2.71828182845904523536 +#define SE_PI 3.14159265358979323846 +#define SE_2_PI 6.28318530717958647692 + +namespace Math +{ + /********************************************************************* + **************************** Helper **************************** + *********************************************************************/ + + template struct Helper + { + using Vec2 = SeVector2; + using Vec3 = SeVector3; + using Vec4 = SeVector4; + + template using Function = Type(*)(Types...); + + template Fn1> static SE_INLINE Vec2 Expand(const Vec2 & a) { return Vec2(Fn1(a.x), Fn1(a.y)); } + template Fn1> static SE_INLINE Vec3 Expand(const Vec3 & a) { return Vec3(Fn1(a.x), Fn1(a.y), Fn1(a.z)); } + template Fn1> static SE_INLINE Vec4 Expand(const Vec4 & a) { return Vec4(Fn1(a.x), Fn1(a.y), Fn1(a.z), Fn1(a.w)); } + + template Fn2> static SE_INLINE Vec2 Expand(const Vec2 & a, Type s) { return Vec2(Fn2(a.x, s), Fn2(a.y, s)); } + template Fn2> static SE_INLINE Vec3 Expand(const Vec3 & a, Type s) { return Vec3(Fn2(a.x, s), Fn2(a.y, s), Fn2(a.z, s)); } + template Fn2> static SE_INLINE Vec4 Expand(const Vec4 & a, Type s) { return Vec4(Fn2(a.x, s), Fn2(a.y, s), Fn2(a.z, s), Fn2(a.w, s)); } + + template Fn2> static SE_INLINE Vec2 Expand(const Vec2 & a, const Vec2 & b) { return Vec2(Fn2(a.x, b.x), Fn2(a.y, b.y)); } + template Fn2> static SE_INLINE Vec3 Expand(const Vec3 & a, const Vec3 & b) { return Vec3(Fn2(a.x, b.x), Fn2(a.y, b.y), Fn2(a.z, b.z)); } + template Fn2> static SE_INLINE Vec4 Expand(const Vec4 & a, const Vec4 & b) { return Vec4(Fn2(a.x, b.x), Fn2(a.y, b.y), Fn2(a.z, b.z), Fn2(a.w, b.w)); } + + template Fn3> static SE_INLINE Vec2 Expand(const Vec2 & a, Type s0, Type s1) { return Vec2(Fn3(a.x, s0, s1), Fn3(a.y, s0, s1)); } + template Fn3> static SE_INLINE Vec3 Expand(const Vec3 & a, Type s0, Type s1) { return Vec3(Fn3(a.x, s0, s1), Fn3(a.y, s0, s1), Fn3(a.z, s0, s1)); 
} + template Fn3> static SE_INLINE Vec4 Expand(const Vec4 & a, Type s0, Type s1) { return Vec4(Fn3(a.x, s0, s1), Fn3(a.y, s0, s1), Fn3(a.z, s0, s1), Fn3(a.w, s0, s1)); } + + template Fn3> static SE_INLINE Vec2 Expand(const Vec2 & a, const Vec2 & b, Type scalar) { return Vec2(Fn3(a.x, b.x, scalar), Fn3(a.y, b.y, scalar)); } + template Fn3> static SE_INLINE Vec3 Expand(const Vec3 & a, const Vec3 & b, Type scalar) { return Vec3(Fn3(a.x, b.x, scalar), Fn3(a.y, b.y, scalar), Fn3(a.z, b.z, scalar)); } + template Fn3> static SE_INLINE Vec4 Expand(const Vec4 & a, const Vec4 & b, Type scalar) { return Vec4(Fn3(a.x, b.x, scalar), Fn3(a.y, b.y, scalar), Fn3(a.z, b.z, scalar), Fn3(a.w, b.w, scalar)); } + }; + + /********************************************************************* + ************************* Mathematics ************************** + *********************************************************************/ + + SE_INLINE float Log(float x) { return logf(x);} + SE_INLINE float Abs(float a) { return fabs(a); } + SE_INLINE float Sin(float a) { return sinf(a); } + SE_INLINE float Cos(float a) { return cosf(a); } + SE_INLINE float Tan(float a) { return tanf(a); } + SE_INLINE float Exp(float a) { return expf(a); } + SE_INLINE float Asin(float a) { return asinf(a); } + SE_INLINE float Acos(float a) { return acosf(a); } + SE_INLINE float Atan(float a) { return atanf(a); } + SE_INLINE float Sqrt(float a) { return sqrtf(a); } + SE_INLINE float Cbrt(float x) { return cbrtf(x); } + SE_INLINE float Ceil(float a) { return ceilf(a); } + SE_INLINE float Recip(float a) { return 1.0f / a; } + SE_INLINE float Floor(float a) { return floorf(a); } + SE_INLINE float Round(float a) { return roundf(a); } + SE_INLINE float Pow(float x, float y) { return powf(x, y); } + SE_INLINE float Atan2(float y, float x) { return atan2f(y, x); } + SE_INLINE constexpr float Radians(float a) { return a * SE_SCF(SE_PI / 180.0); } + + SE_INLINE float Rsqrt(float a) { return 1.0f / sqrtf(a); } + + template 
SE_INLINE constexpr Type Square(Type a) { return a * a; } + template SE_INLINE constexpr Type Cubic(Type a) { return a * a * a; } + template SE_INLINE constexpr Type Min(Type a, Type b) { return SE_MIN(a, b); } + template SE_INLINE constexpr Type Max(Type a, Type b) { return SE_MAX(a, b); } + template SE_INLINE constexpr Type Sign(Type a) { return Type(a > 0) - Type(a < 0); } + template SE_INLINE constexpr Type Clamp(Type a, Type minVal, Type maxVal) { return Min(Max(minVal, a), maxVal); } + + template SE_INLINE bool IsNan(Type a) { return false; } + template SE_INLINE bool IsInf(Type a) { return false; } + template SE_INLINE Type Sum(const SeVector2 & a) { return a.x + a.y; } + template SE_INLINE Type Sum(const SeVector3 & a) { return a.x + a.y + a.z; } + template SE_INLINE Type Sum(const SeVector4 & a) { return a.x + a.y + a.z + a.w; } + template SE_INLINE Type Average(const Type & v0, const Type & v1) { return (v0 + v1) / 2; } + template SE_INLINE Type Average(const Type & v0, const Type & v1, const Type & v2) { return (v0 + v1 + v2) / 3; } + template SE_INLINE Type Lerp(const Type & a, const Type & b, float ratio) { return a * (1.0f - ratio) + b * ratio; } + + template SE_INLINE Type Dot(const SeVector & a, const SeVector & b) { return Sum(a * b); } + template SE_INLINE SeVector Abs(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Sign(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Recip(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Rsqrt(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Cubic(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Square(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Min(const SeVector & a, const SeVector & b) { return Helper::template Expand>(a, b); } + template SE_INLINE SeVector Max(const SeVector & a, const SeVector & b) { return 
Helper::template Expand>(a, b); } + template SE_INLINE SeVector Clamp(const SeVector & a, Type minVal, Type maxVal) { return Helper::Expand(a, minVal, maxVal); } + + template SE_INLINE Type Min(const Type & a, const Type & b, const Args &... args) { return Min(Min(a, b), args...); } + template SE_INLINE Type Max(const Type & a, const Type & b, const Args &... args) { return Max(Max(a, b), args...); } + + SE_INLINE SeVector3 Cross(const SeVector2 & a, const SeVector2 & b) { return SeVector3(0.0f, 0.0f, a.x * b.y - a.y * b.x); } + SE_INLINE SeVector3 Cross(const SeVector3 & a, const SeVector3 & b) { return SeVector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); } + SE_INLINE SeVector4 Cross(const SeVector4 & a, const SeVector4 & b) { return SeVector4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); } + + template SE_INLINE float SqrLength(const SeVector & a) { return Dot(a, a); } + template SE_INLINE float Length(const SeVector & a) { return Sqrt(Dot(a, a)); } + template SE_INLINE float RelativeDistance(const SeVector& a, const SeVector& b) { float maxLength = Max(Length(b), Length(a)); if (maxLength == 0.f) return 0.f; else return Length(b - a) / maxLength; } + template SE_INLINE SeVector Sin(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Cos(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Tan(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Asin(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Acos(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Atan(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Sqrt(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Ceil(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Floor(const SeVector & a) { return Helper::Expand(a); } + 
template SE_INLINE SeVector Round(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE SeVector Radians(const SeVector & a) { return Helper::Expand(a); } + template SE_INLINE float TripleProduct(const SeVector & a, const SeVector & b, const SeVector & c) { return Dot(a, Cross(b, c)); } + + //! 若输入a为NaN或0,返回0.0f。 + template SE_INLINE SeVector Normalize(const SeVector & a) + { + float sqrLen = SqrLength(a); if (sqrLen > 0.0f) { return a * Rsqrt(sqrLen); } + + // printf("Normalizing an invalid vector: (%f, %f, %f)!\n", a.x, a.y, a.z); + + return SeVector(0.0f); + } + + template SE_INLINE bool Normalized(SeVector & a) + { + float sqrLen = SqrLength(a); + if (sqrLen > 0.f) + { + a *= Rsqrt(sqrLen); + return true; + } + return false; + } + + template<> SE_INLINE bool IsNan(float a) { return isnan(a); } + template<> SE_INLINE bool IsNan(double a) { return isnan(a); } + template<> SE_INLINE bool IsNan(SeVector2 a) { return isnan(a.x) || isnan(a.y); } + template<> SE_INLINE bool IsNan(SeVector3 a) { return isnan(a.x) || isnan(a.y) || isnan(a.z); } + template<> SE_INLINE bool IsNan(SeVector4 a) { return isnan(a.x) || isnan(a.y) || isnan(a.z) || isnan(a.w); } + template<> SE_INLINE bool IsNan(SeMatrix3f a) { return IsNan(a.Column(0)) || IsNan(a.Column(1)) || IsNan(a.Column(2)); } + + template<> SE_INLINE bool IsInf(float a) { return isinf(a); } + template<> SE_INLINE bool IsInf(double a) { return isinf(a); } + template<> SE_INLINE bool IsInf(SeVector2 a) { return isinf(a.x) || isinf(a.y); } + template<> SE_INLINE bool IsInf(SeVector3 a) { return isinf(a.x) || isinf(a.y) || isinf(a.z); } + template<> SE_INLINE bool IsInf(SeVector4 a) { return isinf(a.x) || isinf(a.y) || isinf(a.z) || isinf(a.w); } + template<> SE_INLINE bool IsInf(SeMatrix3f a) { return IsInf(a.Column(0)) || IsInf(a.Column(1)) || IsInf(a.Column(2)); } +}; + +SE_NAMESPACE_END + +#undef min +#undef max \ No newline at end of file diff --git 
a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMathSimd.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMathSimd.h new file mode 100644 index 0000000..a7e7cca --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMathSimd.h @@ -0,0 +1,134 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include "SeVectorSimd.h" + +#include "SeMath.h" + +SE_NAMESPACE_BEGIN + +namespace Math +{ + template struct HelperSimd + { + using Smv3 = SimdVector3; + + template using Function = Type(*)(Types...); + + template Fn1> static SE_INLINE Smv3 Expand(const Smv3& a) { return Smv3(Fn1(a.x), Fn1(a.y), Fn1(a.z)); } + + template Fn2> static SE_INLINE Smv3 Expand(const Smv3& a, Type s) { return Smv3(Fn2(a.x, s), Fn2(a.y, s), Fn2(a.z, s)); } + + template Fn2> static SE_INLINE Smv3 Expand(const Smv3& a, const Smv3& b) { return Smv3(Fn2(a.x, b.x), Fn2(a.y, b.y), Fn2(a.z, b.z)); } + + template Fn3> static SE_INLINE Smv3 Expand(const Smv3& a, Type s0, Type s1) { return Smv3(Fn3(a.x, s0, s1), Fn3(a.y, s0, s1), Fn3(a.z, s0, s1)); } + + template Fn3> static SE_INLINE Smv3 Expand(const Smv3& a, const Smv3& b, Type scalar) { return Smv3(Fn3(a.x, b.x, scalar), Fn3(a.y, b.y, scalar), Fn3(a.z, b.z, scalar)); } + }; + + template SE_INLINE SimdVector3 Abs(const SimdVector3& a) { return HelperSimd::Expand(a); } + template SE_INLINE SimdVector3 Sign(const SimdVector3& a) { return HelperSimd::Expand(a); } + template SE_INLINE SimdVector3 Recip(const SimdVector3& a) { return HelperSimd::Expand(a); } + template SE_INLINE SimdVector3 Rsqrt(const SimdVector3& a) { return HelperSimd::Expand(a); } + template SE_INLINE SimdVector3 Cubic(const SimdVector3& a) { return HelperSimd::Expand(a); } + template SE_INLINE SimdVector3 Square(const SimdVector3& a) { return HelperSimd::Expand(a); } + template SE_INLINE SimdVector3 Min(const SimdVector3& a, const SimdVector3& b) { return HelperSimd::Expand(a, b); } + template SE_INLINE SimdVector3 Max(const SimdVector3& a, const SimdVector3& b) { return HelperSimd::Expand(a, b); } + template SE_INLINE SimdVector3 Clamp(const SimdVector3& a, Type minVal, Type maxVal) { return HelperSimd::Expand(a, minVal, maxVal); } + + + template<> SE_INLINE SimdVector3 Sin(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE 
SimdVector3 Cos(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Tan(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Asin(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Acos(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Atan(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Ceil(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Floor(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Round(const SimdVector3& a) { return HelperSimd::Expand(a); } + template<> SE_INLINE SimdVector3 Radians(const SimdVector3& a) { return HelperSimd::Expand(a); } + + /********************************************************************* + ************************* Mathematics ************************** + *********************************************************************/ + + template SE_INLINE Type Sum(const SimdVector3 & a) { return a.x + a.y + a.z; } + template SE_INLINE Type Dot(const SimdVector3 & a, const SimdVector3 & b) { return Sum(a * b); } + + template<> SE_INLINE bool IsNan(const Simd3f & a) { return isnan(a.x) || isnan(a.y) || isnan(a.z) || isnan(a.w); } + template<> SE_INLINE bool IsInf(const Simd3f & a) { return isinf(a.x) || isinf(a.y) || isinf(a.z) || isinf(a.w); } + + template<> SE_INLINE Simd3f Recip(const Simd3f & a) { return Simd3f(_mm_rcp_ps(a.pack)); } + template<> SE_INLINE Simd3f Sqrt(const Simd3f & a) { return Simd3f(_mm_sqrt_ps(a.pack)); } + template<> SE_INLINE Simd3f Rsqrt(const Simd3f & a) { return Simd3f(_mm_rsqrt_ps(a.pack)); } + template<> SE_INLINE Simd3f Min(const Simd3f & a, const Simd3f & b) { return Simd3f(_mm_min_ps(a.pack, b.pack)); } + template<> SE_INLINE Simd3f Max(const Simd3f & a, const Simd3f & b) { return Simd3f(_mm_max_ps(a.pack, b.pack)); } + + 
template<> SE_INLINE float SqrLength(const Simd3f& a) { return Dot(a, a); } + template<> SE_INLINE float Length(const Simd3f& a) { return Sqrt(Dot(a, a)); } + template<> SE_INLINE float RelativeDistance(const Simd3f& a, const Simd3f& b) { float maxLength = Max(Length(b), Length(a)); if (maxLength == 0.f) return 0.f; else return Length(b - a) / maxLength; } + + template<> SE_INLINE Simd3f Normalize(const Simd3f & a) + { + float sqrLength = Math::SqrLength(a); + return sqrLength > 0.f ? a * Rsqrt(Simd3f(sqrLength, sqrLength, sqrLength, 1.f)) : Simd3f(0.0f); + } + + template<> SE_INLINE bool Normalized(Simd3f & a) + { + float sqrLength = Math::SqrLength(a); + if (sqrLength > 0.f) + { + a *= Rsqrt(Simd3f(sqrLength, sqrLength, sqrLength, 1.f)); + return true; + } + a = Simd3f(0.f); + return false; + } + + SE_INLINE Simd3f Cross(const Simd3f & a, const Simd3f & b) + { + __m128 t0 = _mm_shuffle_ps(a.pack, a.pack, _MM_SHUFFLE(3, 0, 2, 1)); //! (y0, z0, x0, 0) + __m128 t1 = _mm_shuffle_ps(b.pack, b.pack, _MM_SHUFFLE(3, 1, 0, 2)); //! (z1, x1, y1, 0) + + __m128 t2 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 0, 2, 1)); //! (z0, x0, y0, 0) + __m128 t3 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 1, 0, 2)); //! (y1, z1, x1, 0) + + return _mm_sub_ps(_mm_mul_ps(t0, t1), _mm_mul_ps(t2, t3)); + } + + template<> SE_INLINE float TripleProduct(const SimdVector3& a, const SimdVector3& b, const SimdVector3& c) + { + return Dot(a, Cross(b, c)); + } +}; + +SE_NAMESPACE_END + +#undef min +#undef max diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMatrix.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMatrix.h new file mode 100644 index 0000000..8570f95 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMatrix.h @@ -0,0 +1,1400 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include "SeVector.h" + + +SE_NAMESPACE_BEGIN + + +template class SeMatrix +{ + +public: + + static constexpr int nRows = N; + static constexpr int nCols = M; + static constexpr int nElements = N * M; + +private: + + union + { + Type m_data[nElements]; + + SeVector m_columns[M]; + }; + + const SeMatrix & Self() const { return *this; } + +public: + + SeMatrix() {} + + SeMatrix(const Type* d) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] = d[i]; + } + } + + explicit SeMatrix(const Type & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] = rhs; + } + } + + const Type & operator() (int i, int j) const { return m_data[j*nRows + i]; } + Type & operator() (int i, int j) { return m_data[j*nRows + i]; } + + + SeMatrix operator - () const + { + SeMatrix rhs(Type(0)); + for (int i = 0; i < nElements; ++i) + { + rhs.m_data[i] = -m_data[i]; + } + return rhs; + } + + SeMatrix operator + (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] + rhs.m_data[i]; + } + return out; + } + + SeMatrix operator - (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] - rhs.m_data[i]; + } + return out; + } + + SeMatrix & operator += (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] += rhs.m_data[i]; + } + return *this; + } + + SeMatrix & operator -= (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] -= rhs.m_data[i]; + } + return *this; + } + + + SeMatrix operator * (const Type & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs; + } + return out; + } + + SeMatrix & operator *= (const Type & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] *= rhs; + } + return *this; + } + + SeVector operator * (const SeVector & rhs) const + { + SeVector ans(Type(0)); + for (int i = 
0; i < N; ++i) + { + Type s(0); + for (int j = 0; j < M; ++j) + { + s += Self()(i, j) * rhs[j]; + } + ans[i] = s; + } + return ans; + } + + template SeMatrix operator * (const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < N; ++i) + { + for (int k = 0; k < K; ++k) + { + Type s(0); + for (int j = 0; j < M; ++j) + { + s += Self()(i, j) * rhs(j, k); + } + ans(i, k) = s; + } + } + return ans; + } + + + SeMatrix P2PProduct(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (size_t i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs.m_data[i]; + } + return out; + } + + SeMatrix P2PDivide(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] / rhs.m_data[i]; + } + return out; + } + + template SeMatrix KroneckerProduct(const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < N; ++i) + { + for (int j = 0; j < M; ++j) + { + int rStart = i * K; + int cStart = j * L; + + Type ij = Self()(i, j); + + for (int p = 0; p < K; ++p) + { + for (int q = 0; q < L; ++q) + { + ans(rStart + p, cStart + q) = ij * rhs(p, q); + } + } + } + } + } + + + SeVector Row(int rId) const + { + SeVector x; + for (int k = 0; k < M; ++k) + { + x[k] = Self()(rId, k); + } + return x; + } + + SeVector Column(int cId) const + { + return m_columns[cId]; + } + + template SeMatrix Block(int row, int col) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < K; ++i) + { + for (int j = 0; j < L; ++j) + { + ans(i, j) = Self()(i + row, j + col); + } + } + return ans; + } + + SeVector Diagonal()const + { + SeVector x; + for (int i = 0; i < SE_MIN(N, M); ++i) + { + x[i] = Self()(i, i); + } + return x; + } + + void SetRow(const SeVector & row, int rId) + { + for (int k = 0; k < M; ++k) + { + m_data[rId + k * nRows] = row[k]; + } + } + + void SetColumn(const SeVector & column, int cId) + { + m_columns[cId] = column; + } + + template void SetBlock(const SeMatrix & block, int 
rowStart, int colStart) + { + for (int i = 0; i < K; ++i) + { + for (int j = 0; j < L; ++j) + { + Self()(i + rowStart, j + colStart) = block(i, j); + } + } + } + + void SetDiagonal(const SeVector & diagonal) + { + for (int i = 0; i < N; ++i) + { + for (int j = 0; j < M; ++j) + { + Self()(i, j) = (i == j) ? diagonal[i] : Type(0); + } + } + } + + SeMatrix Transpose() const + { + SeMatrix trans(Type(0)); + for (int i = 0; i < N; ++i) + { + for (int j = 0; j < M; ++j) + { + trans(j, i) = Self()(i, j); + } + } + return trans; + } + + static SeMatrix Identity(Type diagonal) + { + SeMatrix identity = SeMatrix(Type(0)); + for (int i = 0; i < SE_MIN(N, M); ++i) + { + identity(i, i) = diagonal; + } + return identity; + } + + + + Type FrobeniusNorm() const + { + Type sum(0); + for (int i = 0; i < nElements; ++i) + { + sum += m_data[i] * m_data[i]; + } + return std::sqrt(sum); + } + + Type Trace() const + { + Type trace(0); + for (int i = 0; i < SE_MIN(N, M); ++i) + { + trace += Self()(i, i); + } + return trace; + } + +}; + +template +SE_INLINE SeMatrix operator * (const Type & scalar, const SeMatrix & mat) +{ + return mat * scalar; +} + +template +SE_INLINE SeMatrix OuterProduct(const SeVector& vecM, const SeVector& vecN) +{ + SeMatrix ans(Type(0)); + for (int i = 0; i < M; ++i) + { + for (int j = 0; j < N; ++j) + { + ans(i, j) = vecM[i] * vecN[j]; + } + } + return ans; +} + + +////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////// + + +//template class _declspec(align(16)) SeMatrix +template class SE_ALIGN(16) SeMatrix +{ + +public: + + static constexpr int DIM = 2; + + static constexpr int nElements = DIM * DIM; + +private: + + union + { + Type m_data[nElements]; + + SeVector2 m_columns[DIM]; + }; + + const SeMatrix & Self() const { return *this; } + +public: + + SeMatrix() {} + + explicit SeMatrix(const Type & v) + { + m_data[0] = v; m_data[1] = v; m_data[2] = v; m_data[3] = 
v; + } + + SeMatrix(const Type & v0, const Type & v1, const Type & v2, const Type & v3) + { + m_data[0] = v0; m_data[1] = v1; m_data[2] = v2; m_data[3] = v3; + } + + + const Type & operator() (int i, int j) const { return m_data[j*DIM + i]; } + Type & operator() (int i, int j) { return m_data[j*DIM + i]; } + + SeMatrix operator - () const + { + SeMatrix rhs(Type(0)); + for (int i = 0; i < nElements; ++i) + { + rhs.m_data[i] = -m_data[i]; + } + return rhs; + } + + SeMatrix operator + (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] + rhs.m_data[i]; + } + return out; + } + + SeMatrix operator - (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] - rhs.m_data[i]; + } + return out; + } + + SeMatrix & operator += (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] += rhs.m_data[i]; + } + return *this; + } + + SeMatrix & operator -= (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] -= rhs.m_data[i]; + } + return *this; + } + + + SeMatrix operator * (const Type & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs; + } + return out; + } + + SeMatrix & operator *= (const Type & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] *= rhs; + } + return *this; + } + + SeVector operator * (const SeVector & rhs) const + { + SeVector ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + Type s(0); + for (int j = 0; j < DIM; ++j) + { + s += Self()(i, j) * rhs[j]; + } + ans[i] = s; + } + return ans; + } + + template SeMatrix operator * (const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int k = 0; k < K; ++k) + { + Type s(0); + for (int j = 0; j < DIM; ++j) + { + s += Self()(i, j) * rhs(j, k); + } + ans(i, k) = s; + } + } + return ans; + } + + + 
SeMatrix P2PProduct(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (size_t i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs.m_data[i]; + } + return out; + } + + SeMatrix P2PDivide(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] / rhs.m_data[i]; + } + return out; + } + + template SeMatrix KroneckerProduct(const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int j = 0; j < DIM; ++j) + { + int rStart = i * K; + int cStart = j * L; + + Type ij = Self()(i, j); + + for (int p = 0; p < K; ++p) + { + for (int q = 0; q < L; ++q) + { + ans(rStart + p, cStart + q) = ij * rhs(p, q); + } + } + } + } + } + + + SeVector Row(int rId) const + { + SeVector x; + x[0] = m_data[rId]; + x[1] = m_data[rId + DIM]; + return x; + } + + SeVector Column(int cId) const + { + return m_columns[cId]; + } + + SeVector Diagonal()const + { + SeVector x; + x[0] = m_data[0]; + x[1] = m_data[3]; + return x; + } + + void SetRow(const SeVector & row, int rId) + { + m_data[rId] = row[0]; + m_data[rId + 2] = row[1]; + } + + void SetColumn(const SeVector & column, int cId) + { + m_columns[cId] = column; + } + + void SetDiagonal(const SeVector & diagonal) + { + m_data[0] = diagonal[0]; m_data[1] = 0; + m_data[2] = 0; m_data[3] = diagonal[1]; + } + + SeMatrix Transpose() const + { + SeMatrix trans = Self(); + trans(0, 1) = Self()(1, 0); + trans(1, 0) = Self()(0, 1); + return trans; + } + + static SeMatrix Identity(Type diagonal = Type(1)) + { + SeMatrix identity(Type(0)); + identity(0, 0) = diagonal; + identity(1, 1) = diagonal; + return identity; + } + + Type FrobeniusNorm() const + { + Type sum(0); + for (int i = 0; i < nElements; ++i) + { + sum += m_data[i] * m_data[i]; + } + return std::sqrt(sum); + } + + Type Trace() const + { + return m_data[0] + m_data[3]; + } + + Type Det() const + { + return m_data[0] * m_data[3] - m_data[1] * m_data[2]; + } + + 
void Inverse(SeMatrix & result) const + { + Type dt = 1 / Det(); + + result(0, 0) = dt * Self()(1, 1); + result(1, 0) = -dt * Self()(1, 0); + result(0, 1) = -dt * Self()(0, 1); + result(1, 1) = dt * Self()(0, 0); + } + + SeMatrix Inverse() const + { + SeMatrix r(Self()); + Inverse(r); + return r; + } +}; + + + +template class SeMatrix +{ + +public: + + static constexpr int DIM = 3; + + static constexpr int nElements = DIM * DIM; + +private: + + union + { + Type m_data[nElements]; + SeVector3 m_columns[DIM]; + }; + + const SeMatrix & Self() const { return *this; } + +public: + + SeMatrix() {} + + explicit SeMatrix(const Type & v) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] = v; + } + } + + const Type & operator() (int i, int j) const { return m_data[j*DIM + i]; } + Type & operator() (int i, int j) { return m_data[j*DIM + i]; } + + SeMatrix operator - () const + { + SeMatrix rhs(Type(0)); + for (int i = 0; i < nElements; ++i) + { + rhs.m_data[i] = -m_data[i]; + } + return rhs; + } + + SeMatrix operator + (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] + rhs.m_data[i]; + } + return out; + } + + SeMatrix operator - (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] - rhs.m_data[i]; + } + return out; + } + + SeMatrix & operator += (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] += rhs.m_data[i]; + } + return *this; + } + + SeMatrix & operator -= (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] -= rhs.m_data[i]; + } + return *this; + } + + void operator = (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] = rhs.m_data[i]; + } + } + + + SeMatrix operator * (const Type & rhs) const + { + SeMatrix out(Type(0)); + + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs; + } + + return out; + } 
+ + SeMatrix operator / (const Type & rhs) const + { + SeMatrix out(Type(0)); + + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] / rhs; + } + + return out; + } + + SeMatrix & operator *= (const Type & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] *= rhs; + } + + return *this; + } + + SeMatrix & operator /= (const Type & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] /= rhs; + } + + return *this; + } + + SeVector operator * (const SeVector & rhs) const + { + SeVector ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + Type s(0); + for (int j = 0; j < DIM; ++j) + { + s += Self()(i, j) * rhs[j]; + } + ans[i] = s; + } + return ans; + } + + template SeMatrix operator * (const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int k = 0; k < K; ++k) + { + Type s(0); + for (int j = 0; j < DIM; ++j) + { + s += Self()(i, j) * rhs(j, k); + } + ans(i, k) = s; + } + } + return ans; + } + + + SeMatrix P2PProduct(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (size_t i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs.m_data[i]; + } + return out; + } + + SeMatrix P2PDivide(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] / rhs.m_data[i]; + } + return out; + } + + template SeMatrix KroneckerProduct(const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int j = 0; j < DIM; ++j) + { + int rStart = i * K; + int cStart = j * L; + + Type ij = Self()(i, j); + + for (int p = 0; p < K; ++p) + { + for (int q = 0; q < L; ++q) + { + ans(rStart + p, cStart + q) = ij * rhs(p, q); + } + } + } + } + } + + + SeVector Row(int rId) const + { + SeVector x; + x[0] = m_data[rId]; + x[1] = m_data[rId + 3]; + x[2] = m_data[rId + 6]; + return x; + } + + SeVector Column(int cId) const + { + return m_columns[cId]; + } + + SeVector Diagonal()const + { + SeVector 
x; + x[0] = m_data[0]; + x[1] = m_data[4]; + x[2] = m_data[8]; + return x; + } + + void SetRow(const SeVector & row, int rId) + { + m_data[rId] = row[0]; + m_data[rId + 3] = row[1]; + m_data[rId + 6] = row[2]; + } + + void SetColumn(const SeVector & column, int cId) + { + m_columns[cId] = column; + } + + void SetDiagonal(const SeVector & diagonal) + { + Type zero(0); + + m_data[0] = diagonal[0]; m_data[1] = zero; m_data[2] = zero; + m_data[3] = zero; m_data[4] = diagonal[1]; m_data[5] = zero; + m_data[6] = zero; m_data[7] = zero; m_data[8] = diagonal[2]; + } + + SeMatrix Transpose() const + { + SeMatrix trans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int j = 0; j < DIM; ++j) + { + trans(i, j) = Self()(j, i); + } + } + return trans; + } + + static SeMatrix Identity(Type diagonal = Type(1)) + { + SeMatrix identity(Type(0)); + identity(0, 0) = diagonal; + identity(1, 1) = diagonal; + identity(2, 2) = diagonal; + return identity; + } + + Type FrobeniusNorm() const + { + Type sum(0); + for (int i = 0; i < nElements; ++i) + { + sum += m_data[i] * m_data[i]; + } + return std::sqrt(sum); + } + + Type Trace() const + { + return m_data[0] + m_data[4] + m_data[8]; + } + + Type Det() const + { + return m_data[0] * (m_data[4] * m_data[8] - m_data[7] * m_data[5]) + - m_data[3] * (m_data[1] * m_data[8] - m_data[7] * m_data[2]) + + m_data[6] * (m_data[1] * m_data[5] - m_data[4] * m_data[2]); + } + + void Inverse(SeMatrix & result) const + { + Type dt = 1 / Det(); + + result(0, 0) = dt * (Self()(1, 1) * Self()(2, 2) - Self()(1, 2) * Self()(2, 1)); + result(1, 0) = dt * (Self()(1, 2) * Self()(2, 0) - Self()(1, 0) * Self()(2, 2)); + result(2, 0) = dt * (Self()(1, 0) * Self()(2, 1) - Self()(1, 1) * Self()(2, 0)); + + result(0, 1) = dt * (Self()(0, 2) * Self()(2, 1) - Self()(0, 1) * Self()(2, 2)); + result(1, 1) = dt * (Self()(0, 0) * Self()(2, 2) - Self()(0, 2) * Self()(2, 0)); + result(2, 1) = dt * (Self()(0, 1) * Self()(2, 0) - Self()(0, 0) * Self()(2, 1)); + + result(0, 
2) = dt * (Self()(0, 1) * Self()(1, 2) - Self()(0, 2) * Self()(1, 1)); + result(1, 2) = dt * (Self()(0, 2) * Self()(1, 0) - Self()(0, 0) * Self()(1, 2)); + result(2, 2) = dt * (Self()(0, 0) * Self()(1, 1) - Self()(0, 1) * Self()(1, 0)); + } + + SeMatrix Inverse() const + { + SeMatrix r(Self()); + Inverse(r); + return r; + } +}; + +template +SE_INLINE SeMatrix operator * (const Type & scalar, const SeMatrix & mat) +{ + return mat * scalar; +} + +/*inline Simd3f operator * (const SeMatrix & mat, const Simd3f & vec) +{ + return Simd3f(mat * vec.xyz); +}*/ + + +////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////// + + +template class SE_ALIGN(16) SeMatrix +{ + +public: + + static constexpr int DIM = 4; + static constexpr int nElements = DIM * DIM; + +private: + + union + { + Type m_data[nElements]; + + SeVector4 m_columns[DIM]; + }; + + const SeMatrix & Self() const { return *this; } + +public: + + SeMatrix() {} + + explicit SeMatrix(const Type & scalar) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] = scalar; + } + } + + + const Type & operator() (int i, int j)const { return m_data[j*DIM + i]; } + Type & operator() (int i, int j) { return m_data[j*DIM + i]; } + + SeMatrix operator - () const + { + SeMatrix rhs(Type(0)); + for (int i = 0; i < nElements; ++i) + { + rhs.m_data[i] = -m_data[i]; + } + return rhs; + } + + SeMatrix operator + (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] + rhs.m_data[i]; + } + return out; + } + + SeMatrix operator - (const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] - rhs.m_data[i]; + } + return out; + } + + SeMatrix & operator += (const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] += rhs.m_data[i]; + } + return *this; + } + + SeMatrix & operator -= 
(const SeMatrix & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] -= rhs.m_data[i]; + } + return *this; + } + + + SeMatrix operator * (const Type & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs; + } + return out; + } + + SeMatrix & operator *= (const Type & rhs) + { + for (int i = 0; i < nElements; ++i) + { + m_data[i] *= rhs; + } + return *this; + } + + SeVector operator * (const SeVector & rhs) const + { + SeVector ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + Type s(0); + for (int j = 0; j < DIM; ++j) + { + s += Self()(i, j) * rhs[j]; + } + ans[i] = s; + } + return ans; + } + + template SeMatrix operator * (const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int k = 0; k < K; ++k) + { + Type s(0); + for (int j = 0; j < DIM; ++j) + { + s += Self()(i, j) * rhs(j, k); + } + ans(i, k) = s; + } + } + return ans; + } + + + SeMatrix P2PProduct(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (size_t i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] * rhs.m_data[i]; + } + return out; + } + + SeMatrix P2PDivide(const SeMatrix & rhs) const + { + SeMatrix out(Type(0)); + for (int i = 0; i < nElements; ++i) + { + out.m_data[i] = m_data[i] / rhs.m_data[i]; + } + return out; + } + + template SeMatrix KroneckerProduct(const SeMatrix& rhs) const + { + SeMatrix ans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int j = 0; j < DIM; ++j) + { + int rStart = i * K; + int cStart = j * L; + + Type ij = Self()(i, j); + + for (int p = 0; p < K; ++p) + { + for (int q = 0; q < L; ++q) + { + ans(rStart + p, cStart + q) = ij * rhs(p, q); + } + } + } + } + } + + + SeVector Row(int rId) const + { + SeVector x; + x[0] = m_data[rId]; + x[1] = m_data[rId + DIM]; + x[2] = m_data[rId + DIM * 2]; + x[3] = m_data[rId + DIM * 3]; + return x; + } + + SeVector Column(int cId) const + { + return m_columns[cId]; + } + + SeVector 
Diagonal()const + { + SeVector x; + x[0] = m_data[0]; + x[1] = m_data[5]; + x[2] = m_data[10]; + x[3] = m_data[15]; + return x; + } + + void SetRow(const SeVector & row, int rId) + { + m_data[rId] = row[0]; + m_data[rId + DIM] = row[1]; + m_data[rId + DIM * 2] = row[2]; + m_data[rId + DIM * 3] = row[3]; + } + + void SetColumn(const SeVector & column, int cId) + { + m_columns[cId] = column; + } + + void SetDiagonal(const SeVector & diagonal) + { + for (int i = 0; i < DIM; ++i) + { + for (int j = 0; j < DIM; ++j) + { + Self()(i, j) = (i == j) ? diagonal[i] : Type(0); + } + } + } + + SeMatrix Transpose() const + { + SeMatrix trans(Type(0)); + for (int i = 0; i < DIM; ++i) + { + for (int j = 0; j < DIM; ++j) + { + trans(j, i) = Self()(i, j); + } + } + return trans; + } + + static SeMatrix Identity(Type diagonal = Type(1)) + { + SeMatrix identity(Type(0)); + identity(0, 0) = diagonal; + identity(1, 1) = diagonal; + identity(2, 2) = diagonal; + identity(3, 3) = diagonal; + return identity; + } + + Type FrobeniusNorm() const + { + Type sum(0); + for (int i = 0; i < nElements; ++i) + { + sum += m_data[i] * m_data[i]; + } + return std::sqrt(sum); + } + + Type Trace() const + { + return m_data[0] + m_data[5] + m_data[10] + m_data[15]; + } + + + + SeMatrix GetRotationPart() const + { + SeMatrix R; + for (int y = 0; y < 3; ++y) + for (int x = 0; x < 3; ++x) + R(y, x) = (*this)(y, x); + return R; + } + + SeVector3 GetTranslationPart() const + { + SeVector3 t; + for (int y = 0; y < 3; ++y) + t[y] = (*this)(y, 3); + return t; + } + + void SetRotationPart(const SeMatrix & R) + { + for (int y = 0; y < 3; ++y) + for (int x = 0; x < 3; ++x) + { + Self()(y, x) = R(y, x); + } + } + + void SetTranslationPart(const SeVector3 & t) + { + for (int y = 0; y < 3; ++y) + { + Self()(y, 3) = t[y]; + } + } +}; + +////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////// + +template static SeMatrix 
Kronecker(const SeMatrix& left, const SeMatrix& rhs) +{ + SeMatrix ans(Type(0)); + for (int i = 0; i < N; ++i) + { + for (int j = 0; j < M; ++j) + { + int rStart = i * K; + int cStart = j * L; + + Type ij = left(i, j); + + for (int p = 0; p < K; ++p) + { + for (int q = 0; q < L; ++q) + { + ans(rStart + p, cStart + q) = ij * rhs(p, q); + } + } + } + } + return ans; +} + + +template static SeMatrix GetColumn(const SeMatrix& a, int ci) +{ + SeMatrix ret; + for (int i = 0; i < N; i++) + { + ret(i, 0) = a(i, ci); + } + return ret; +} + + + + +////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////// + +template SE_INLINE bool HasNan(const SeMatrix & mat) +{ + for (int i = 0; i < N; ++i) + { + for (int j = 0; j < M; ++j) + { + if (std::isnan(mat(i, j))) + return true; + } + } + return false; +} + +template +struct AnonymousArray { T data[N]; }; + +template +static SeMatrix CreateWithColumnMajor(const AnonymousArray& a) +{ + SeMatrix m; + + for (int ri = 0; ri < Row; ri++) + { + for (int ci = 0; ci < Col; ci++) + { + m(ri, ci) = a.data[ri + ci * Row]; + } + } + + return m; +} + +////////////////////////////////////////////////////////////////////////// + +template using SeMatrix2 = SeMatrix; +template using SeMatrix3 = SeMatrix; +template using SeMatrix4 = SeMatrix; +template using SeMatrix9 = SeMatrix; +template using SeMatrix12 = SeMatrix; + +using SeMatrix2f = SeMatrix2; +using SeMatrix3f = SeMatrix3; +using SeMatrix4f = SeMatrix4; +using SeMatrix9f = SeMatrix9; +using SeMatrix12f = SeMatrix12; + +using SeMatrix2i = SeMatrix2; +using SeMatrix3i = SeMatrix3; +using SeMatrix4i = SeMatrix4; + +using SeMatrix23f = SeMatrix; +using SeMatrix32f = SeMatrix; +using SeMatrix34f = SeMatrix; +using SeMatrix43f = SeMatrix; + + +SE_NAMESPACE_END diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMorton.h 
b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMorton.h new file mode 100644 index 0000000..1c854b8 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeMorton.h @@ -0,0 +1,108 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include "SeMath.h" + +SE_NAMESPACE_BEGIN + +/************************************************************************* +**************************** SeMorton64 **************************** +*************************************************************************/ + +/** + * @brief 64-bit Morton-Code object. + */ +class SE_ALIGN(8) SeMorton64 +{ + +public: + + using value_type = unsigned long long; + + //! @brief Default constructor. + SeMorton64() {} + + //! @brief Convert to vulue_type. + operator value_type() const { return m_Value; } + + //! @brief Constructed by a given value. + SeMorton64(value_type _Value) : m_Value(_Value) {} + + //! @brief Encoded by the given 3d point located within the unit cube (controlable precision). + template void Encode(float x, float y, float z, value_type lastBits) + { + static_assert(precision <= 21, "The highest precision for 64-bit Morton code is 21."); + + x = Math::Clamp(x * (1ll << precision), 0.0f, (1ll << precision) - 1.0f); + y = Math::Clamp(y * (1ll << precision), 0.0f, (1ll << precision) - 1.0f); + z = Math::Clamp(z * (1ll << precision), 0.0f, (1ll << precision) - 1.0f); + + value_type xx = SeMorton64::ExpandBits(static_cast(x)) << (66 - 3 * precision); + value_type yy = SeMorton64::ExpandBits(static_cast(y)) << (65 - 3 * precision); + value_type zz = SeMorton64::ExpandBits(static_cast(z)) << (64 - 3 * precision); + + constexpr value_type bitMask = ~value_type(0) >> (3 * precision); + + m_Value = xx + yy + zz + (lastBits & bitMask); + } + + //! @brief Encoded by the given 3d point located within the unit cube (full precision). 
+ void Encode(float x, float y, float z) + { + x = Math::Clamp(x * (1ll << 21), 0.0f, (1ll << 21) - 1.0f); + y = Math::Clamp(y * (1ll << 21), 0.0f, (1ll << 21) - 1.0f); + z = Math::Clamp(z * (1ll << 21), 0.0f, (1ll << 21) - 1.0f); + + value_type xx = SeMorton64::ExpandBits(static_cast(x)); + value_type yy = SeMorton64::ExpandBits(static_cast(y)); + value_type zz = SeMorton64::ExpandBits(static_cast(z)); + + m_Value = (xx << 2) + (yy << 1) + zz; + } + +private: + + /** + * @brief Expand bits by inserting two zeros after each bit. + * @e.g. 0000 0000 1111 -> 0010 0100 1001 + */ + static value_type ExpandBits(value_type bits) + { + bits = (bits | (bits << 32)) & 0xFFFF00000000FFFFu; + bits = (bits | (bits << 16)) & 0x00FF0000FF0000FFu; + bits = (bits | (bits << 8)) & 0xF00F00F00F00F00Fu; + bits = (bits | (bits << 4)) & 0x30C30C30C30C30C3u; + return (bits | (bits << 2)) & 0x9249249249249249u; + } + +private: + + value_type m_Value; +}; + +SE_NAMESPACE_END \ No newline at end of file diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeOmp.cpp b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeOmp.cpp new file mode 100644 index 0000000..4a69503 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeOmp.cpp @@ -0,0 +1,33 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. 
The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "SeOmp.h" + +#ifdef WIN32 + int CPU_THREAD_NUM = (omp_get_num_procs() - 1) > 1 ? (omp_get_num_procs() - 1) : 1; +#else + int CPU_THREAD_NUM = 1; +#endif diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeOmp.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeOmp.h new file mode 100644 index 0000000..1c1f063 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeOmp.h @@ -0,0 +1,37 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include + +extern int CPU_THREAD_NUM; + +#define OMP_BARRIER __pragma(omp barrier) +#define OMP_PARALLEL __pragma(omp parallel num_threads(CPU_THREAD_NUM)) +#define OMP_FOR __pragma(omp for) +#define OMP_PARALLEL_FOR __pragma(omp parallel for num_threads(CPU_THREAD_NUM)) +#define OMP_PARALLEL_FOR_SUM_REDUCTION(sum) __pragma(omp parallel for num_threads(CPU_THREAD_NUM) reduction(+: sum)) diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SePreDefine.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SePreDefine.h new file mode 100644 index 0000000..908c8b2 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SePreDefine.h @@ -0,0 +1,81 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include + +/************************************************************************* +*************************** Min_Max_Swap *************************** +*************************************************************************/ + +#define SE_MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define SE_MAX(a, b) (((a) > (b)) ? (a) : (b)) +#define SE_SWAP(a, b) { auto c = a; a = b; b = c;} + +/************************************************************************* +**************************** Namespace ***************************** +*************************************************************************/ + +#define SE_NAMESPACE SE +#define SE_NAMESPACE_BEGIN namespace SE_NAMESPACE { +#define SE_USING_NAMESPACE using namespace SE_NAMESPACE; +#define SE_NAMESPACE_END } + +/************************************************************************* +*************************** Noncopyable **************************** +*************************************************************************/ + +#define SE_NONCOPYABLE(className) \ + \ + className(const className&) = delete; \ + \ + void operator=(const className&) = delete; \ + +/************************************************************************* +**************************** DLL_Export **************************** +*************************************************************************/ + + +#define SE_INLINE __inline +#define SE_ALIGN(n) 
__declspec(align(n)) //! Only for windows platform. +#define SE_ASSERT(expression) assert(expression) + + +/************************************************************************* +***************************** Casting ****************************** +*************************************************************************/ + +#define SE_SCI(ClassObject) static_cast(ClassObject) +#define SE_SCF(ClassObject) static_cast(ClassObject) +#define SE_SCD(ClassObject) static_cast(ClassObject) +#define SE_SCU(ClassObject) static_cast(ClassObject) + +#ifndef NOMINMAX + #define NOMINMAX +#endif \ No newline at end of file diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditioner.cpp b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditioner.cpp new file mode 100644 index 0000000..f8d6e9b --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditioner.cpp @@ -0,0 +1,1719 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "SeSchwarzPreconditioner.h" + +#include "SeIntrinsic.h" + +#include +#include + + +SE_USING_NAMESPACE + + +void SeSchwarzPreconditioner::AllocatePrecoditioner(int numVerts, int numEdges, int numFaces) +{ + m_numVerts = numVerts; + m_numEdges = numEdges; + m_numFaces = numFaces; + + if (m_frameIndex == 0) + { + DoAlllocation(); + } + + if (m_frameIndex % 17 != 0) + { + return; + } + + //==== Reorder + + ComputeTotalAABB(); + + SpaceSort(); + + ComputeInverseMapper(); + + MapHessianTable(); + + m_frameIndex++; +} + +void SeSchwarzPreconditioner::PreparePreconditioner +( + const SeMatrix3f* diagonal, const SeMatrix3f* csrOffDiagonals, const int* csrRanges, + const EfSet* efSets, const EeSet* eeSets, const VfSet* vfSets, + unsigned int* efCounts, unsigned int* eeCounts, unsigned int* vfCounts +) +{ + m_mappedNeighborsRemain = m_mappedNeighbors; + m_mappedNeighborsNumRemain = m_mappedNeighborsNum; + + //==== before reordering + + PrepareCollisionStencils(efSets, eeSets, vfSets, efCounts, eeCounts, vfCounts); + + //==== + + ReorderRealtime(); + + //==== + + const int blockNum = m_totalSz / 32; + Utility::MemsetZero(m_hessian32.Ptr(), m_hessian32.Size()); + 
Utility::MemsetZero(m_additionalHessian32); + + //==== + + PrepareCollisionHessian(); //3 + + PrepareHessian(diagonal, csrOffDiagonals, csrRanges); + + LDLtInverse512(); +} + +void SeSchwarzPreconditioner::Preconditioning(SeVec3fSimd* z, const SeVec3fSimd* residual, int dim) +{ + Utility::MemsetZero(m_mappedZ); + Utility::MemsetZero(m_mappedR); + + BuildResidualHierarchy(residual); + + SchwarzLocalXSym(); + + CollectFinalZ(z); +} + +void SeSchwarzPreconditioner::ComputeLevelNums(int bankSize) +{ + constexpr float SizeRatio = 1.5f; + + //==== + + int totalSz = 0; + + int nLevel = 1; + int levelSz = (m_numVerts + bankSize - 1) / bankSize * bankSize; + totalSz += levelSz; + + while (levelSz > 32) + { + levelSz /= 32; + + nLevel++; + levelSz = (levelSz + bankSize - 1) / bankSize * bankSize; + totalSz += levelSz; + } + + m_numLevel = nLevel; + m_totalSz = totalSz * SizeRatio; +} + +void SeSchwarzPreconditioner::DoAlllocation() +{ + constexpr int bankSize = 32; //==== size of diagonal block + + ComputeLevelNums(bankSize); + + //==== + + m_MapperSortedGetOriginal.resize(m_numVerts); + m_mapperOriginalGetSorted.resize(m_numVerts); + m_mortonCode.resize(m_numVerts); + + m_hessian32.Resize(bankSize, m_totalSz); + m_mappedZ.resize(m_totalSz); + m_mappedR.resize(m_totalSz); + + m_mappedPos.resize(m_numVerts); + + m_denseLevel.resize(m_numVerts); + m_goingNext.resize(m_numLevel * m_numVerts); + m_coarseTables.resize(m_numVerts); + + m_prefixOrignal.resize(m_numVerts); + m_fineConnectMask.resize(m_numVerts); + m_nextConnectMsk.resize(m_numVerts); + m_nextPrefix.resize(m_numVerts); + + + int triSz = (1 + 96) * 96 / 2 + 16 * 3; + const int blockNum = m_totalSz / 32; + m_invSymR.resize(triSz * blockNum); + m_additionalHessian32.resize(m_totalSz + 1); + + //==== + + m_levelSize.resize(m_numLevel + 1); + m_CoarseSpaceTables.Resize(m_numLevel, m_numVerts); + + int maxNeighbours = 0; + + for (int vId = 0; vId < m_numVerts; ++vId) + { + maxNeighbours = Math::Max(m_neighbours->Size(vId) 
+ 1, maxNeighbours); + } + + m_mappedNeighborsNum.resize(m_numVerts); + m_mappedNeighborsNumRemain.resize(m_numVerts); + m_mappedNeighbors.Resize(maxNeighbours, m_numVerts); + m_mappedNeighborsRemain.Resize(maxNeighbours, m_numVerts); + + const int maxCollisionPerVert = 32; + m_maxStencilNum = m_numVerts * maxCollisionPerVert; + m_stencils.resize(m_maxStencilNum); + m_stencilNum = 0; +} + +void SeSchwarzPreconditioner::ComputeTotalAABB() +{ + m_aabb = SeAabb(); + + ComputeAABB(); // TODO: omp reduce +} + + +void SeSchwarzPreconditioner::ComputeAABB() +{ + +//MSVC compiler is stuck with OpenMP version 2.0, and unfortunately for you, reduction(max:) was only introduced with version 3.1 of the OpenMP C/C++ standard (that was in September 2011) + +//#pragma omp parallel for reduction(aabbAdd:m_aabb) + for (int vid = 0; vid < m_numVerts; ++vid) // need omp custom reduction operation + { + m_aabb += m_positions[vid]; + } +} + +void SeSchwarzPreconditioner::SpaceSort() +{ + FillSortingData(); + DoingSort(); +} + +void SeSchwarzPreconditioner::FillSortingData() +{ + OMP_PARALLEL_FOR + + for (int vid = 0; vid < m_numVerts; ++vid) + { + SeVec3fSimd temp = (m_positions[vid] - m_aabb.Lower) / m_aabb.Extent(); + + SeMorton64 mt; + + mt.Encode(temp.x, temp.y, temp.z); + + m_mortonCode[vid] = mt; + + m_MapperSortedGetOriginal[vid] = vid; + } +} + + +void SeSchwarzPreconditioner::DoingSort() +{ + //==== sort m_MapperSortedGetOriginal by m_mortonCode1; + + std::sort(m_MapperSortedGetOriginal.begin(), m_MapperSortedGetOriginal.end(), [&](int i, int j) { return m_mortonCode[i] < m_mortonCode[j]; }); +} + +void SeSchwarzPreconditioner::ComputeInverseMapper() +{ + OMP_PARALLEL_FOR + + for (int vid = 0; vid < m_numVerts; ++vid) + { + int originalIndex = m_MapperSortedGetOriginal[vid]; + + m_mapperOriginalGetSorted[originalIndex] = vid; + } +} + + +void SeSchwarzPreconditioner::MapHessianTable() +{ + OMP_PARALLEL_FOR + + for (int vid = 0; vid < m_numVerts; ++vid) + { + const auto 
originalIndex = m_MapperSortedGetOriginal[vid]; + + m_mappedPos[vid] = m_positions[originalIndex]; + + //==== + + int numNeighbors = m_neighbours->Size(originalIndex); + + const int* neighbors = m_neighbours->IdxPtr(originalIndex); + + m_mappedNeighborsNum[vid] = numNeighbors + 1; // include self + + m_mappedNeighbors[0][vid] = vid; // include self + + for (int k = 1; k < numNeighbors + 1; ++k) + { + unsigned int originalNeighbors = neighbors[k - 1];//m_neighbors[k][originalIndex]; + + m_mappedNeighbors[k][vid] = m_mapperOriginalGetSorted[originalNeighbors]; + } + } +} + +void SeSchwarzPreconditioner::MapCollisionStencilIndices() +{ + m_stencilIndexMapped.resize(m_stencilNum); + + OMP_PARALLEL_FOR + + for (int i = 0; i < m_stencilNum; i++) + { + const auto& s = m_stencils[i]; + + for (int vi = 0; vi < s.verextNumPerStencil; vi++) + { + m_stencilIndexMapped[i][vi] = m_mapperOriginalGetSorted[s.index[vi]]; + } + } +} + +void SeSchwarzPreconditioner::PrepareCollisionStencils(const EfSet* efSets, const EeSet* eeSets, const VfSet* vfSets, unsigned int* efCounts, unsigned int* eeCounts, unsigned int* vfCounts) +{ + int efNum = efCounts[m_numEdges]; + int eeNum = eeCounts[m_numEdges]; + int vfNum = vfCounts[m_numVerts]; + + int totalStencilNum = efNum + eeNum + vfNum; + + if (totalStencilNum > m_maxStencilNum) + { + totalStencilNum = m_maxStencilNum; + printf("stencil size %d exceed max stencils num %d\n", totalStencilNum, m_maxStencilNum); + } + + m_stencilNum = 0; + + OMP_PARALLEL_FOR + + for (int i = 0; i < totalStencilNum; i++) + { + Stencil s; + + if (i < efNum) //process ef + { + const auto& pair = efSets[i]; + + if (pair.m_eId < 0 || pair.m_fId < 0) { continue; } + + const auto& edge = m_edges[pair.m_eId]; + const auto& face = m_faces[pair.m_fId]; + + s.verextNumPerStencil = 5; + s.vertexNumOfFirstPrimitive = 2; + + s.index[0] = edge[0]; + s.index[1] = edge[1]; + s.index[2] = face[0]; + s.index[3] = face[1]; + s.index[4] = face[2]; + + s.weight[0] = pair.m_bary[0]; 
+ s.weight[1] = 1.f - pair.m_bary[0]; + s.weight[2] = -pair.m_bary[1]; + s.weight[3] = -pair.m_bary[2]; + s.weight[4] = -(1.f - pair.m_bary[1] - pair.m_bary[2]); + + s.direction = pair.m_normal; + + s.stiff = pair.stiff; + + } + else if (i < efNum + eeNum) // process ee + { + const auto& pair = eeSets[i]; + + if (pair.m_eId1 < 0 || pair.m_eId0 < 0) { continue; } + + s.verextNumPerStencil = 4; + s.vertexNumOfFirstPrimitive = 2; + + const auto& edge0 = m_edges[pair.m_eId0]; + const auto& edge1 = m_edges[pair.m_eId1]; + + s.index[0] = edge0[0]; + s.index[1] = edge0[1]; + s.index[2] = edge1[0]; + s.index[3] = edge1[1]; + + s.weight[0] = pair.m_bary[0]; + s.weight[1] = 1.f - pair.m_bary[0]; + s.weight[2] = -pair.m_bary[1]; + s.weight[3] = -(1.f - pair.m_bary[1]); + + s.direction = pair.m_normal; + + s.stiff = pair.stiff; + } + else if (i < totalStencilNum) //process vf + { + const auto& pair = vfSets[i]; + + if (pair.m_vId < 0 || pair.m_fId < 0) { continue; } + + s.verextNumPerStencil = 4; + s.vertexNumOfFirstPrimitive = 3; + + const auto& face = m_faces[pair.m_fId]; + + s.index[0] = face[0]; + s.index[1] = face[1]; + s.index[2] = face[2]; + s.index[3] = pair.m_vId; + + s.weight[0] = -pair.m_bary[0]; + s.weight[1] = -pair.m_bary[1]; + s.weight[2] = -(1.f - pair.m_bary[2]); + s.weight[3] = 1.f; + + s.direction = pair.m_normal; + + s.stiff = pair.stiff; + } + + int stencilNum = Intrinsic::AtomicAdd(&m_stencilNum, 1); + + m_stencils[stencilNum] = s; + } + + MapCollisionStencilIndices(); //1 +} + +void SeSchwarzPreconditioner::ReorderRealtime() +{ + Utility::MemsetZero(m_levelSize); + + BuildConnectMaskL0(); + + BuildCollisionConnection(m_fineConnectMask.data(), nullptr); //2 + + PreparePrefixSumL0(); + + BuildLevel1(); + + for (int level = 1; level < m_numLevel; level++) + { + Utility::MemsetZero(m_nextConnectMsk); + + BuildConnectMaskLx(level); + + BuildCollisionConnection(m_nextConnectMsk.data(), m_CoarseSpaceTables[level - 1]); + + NextLevelCluster(level); + + 
PrefixSumLx(level); + + ComputeNextLevel(level); + } + + TotalNodes(); + + AggregationKernel(); +} + +void SeSchwarzPreconditioner::BuildConnectMaskL0() +{ + const unsigned int warpDim = 32; + + int nWarps = (m_numVerts + warpDim - 1) / warpDim; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < nWarps; ++warpIdx) + { + for (int laneIdx = 0; laneIdx < warpDim; ++laneIdx) + { + int vId = warpIdx * warpDim + laneIdx; + + if (vId >= m_numVerts) + { + break; + } + + int numNeighbors = m_mappedNeighborsNumRemain[vId]; + + unsigned int connetMsk = 1U << laneIdx; // include self + + int nk = 0; + + for (int k = 0; k < numNeighbors; k++) + { + int vIdConnected = m_mappedNeighborsRemain[k][vId]; + + int warpIdxConnected = vIdConnected / warpDim; + + if (warpIdx == warpIdxConnected) // in the same warp + { + unsigned int laneIdxConnected = vIdConnected % warpDim; + + connetMsk |= (1U << laneIdxConnected); + } + else + { + m_mappedNeighborsRemain[nk][vId] = vIdConnected; + nk++; + } + } + + m_mappedNeighborsNumRemain[vId] = nk; + + // distance cullling + //const SeVec3fSimd& pos = m_mappedPos[vId]; + //for (int it = 0; it < Intrinsic::Popcount32(activeMsk); it++) + //{ + // const float th = AABB_TH * AABB_TH; + // CuFloat3 po(0.0f); + // po.x = Intrinsic::Shuffle(activeMsk, pos.x, it); + // po.y = Intrinsic::Shuffle(activeMsk, pos.y, it); + // po.z = Intrinsic::Shuffle(activeMsk, pos.z, it); + // if (Math::SqrLength(po - pos) < th) + // { + // connetMsk |= (1U << it); + // } + //} + + m_fineConnectMask[vId] = connetMsk; + } + } +} + + +void SeSchwarzPreconditioner::BuildCollisionConnection(unsigned int* pConnect, const int* pCoarseTable) +{ + const int bank = 32; + + OMP_PARALLEL_FOR + + for (int i = 0; i < m_stencilNum; i++) + { + const auto& s = m_stencils[i]; + Int5 idx = m_stencilIndexMapped[i]; + unsigned int connMsk[5] = {}; + + if (pCoarseTable) + { + for (int it = 0; it < s.verextNumPerStencil; it++) + { + idx[it] = pCoarseTable[idx[it]]; + } + } + + for (int 
ita = 0; ita < s.verextNumPerStencil; ita++) + { + for (int itb = ita + 1; itb < s.verextNumPerStencil; itb++) + { + unsigned int myID = idx[ita]; + unsigned int otID = idx[itb]; + if (myID == otID) // redundant ? + { + continue; + } + if ((myID / bank == otID / bank)) + { + if (ita < s.vertexNumOfFirstPrimitive && itb >= s.vertexNumOfFirstPrimitive) + { + connMsk[ita] |= (1U << (otID % bank)); + connMsk[itb] |= (1U << (myID % bank)); + } + } + } + } + + for (int it = 0; it < s.verextNumPerStencil; it++) + { + if (connMsk[it]) + { + Intrinsic::AtomicOr(&pConnect[idx[it]], connMsk[it]); + } + } + } +} + +void SeSchwarzPreconditioner::PreparePrefixSumL0() +{ + int warpDim = 32; + + int nWarps = (m_numVerts + warpDim - 1) / warpDim; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < nWarps; ++warpIdx) + { + unsigned int cacheMask[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vId = laneIdx + warpIdx * warpDim; + + if (vId >= m_numVerts) { break; } + + cacheMask[laneIdx] = m_fineConnectMask[vId]; + } + + Intrinsic::SyncWarp(); + + int prefixSum = 0; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * warpDim; + + if (vid >= m_numVerts) { break; } + + unsigned int connetMsk = cacheMask[laneIdx]; + + unsigned int visited = (1U << laneIdx); + + while (connetMsk != -1) // + { + unsigned int todo = visited ^ connetMsk; // + + if (!todo) + { + break; + } + + unsigned int nextVisit = Intrinsic::Ffs(todo) - 1; + + visited |= (1U << nextVisit); + + connetMsk |= cacheMask[nextVisit]; + } + + m_fineConnectMask[vid] = connetMsk; + + unsigned int electedPrefix = Intrinsic::Popcount32(connetMsk & Intrinsic::LanemaskLt(laneIdx)); + + if (electedPrefix == 0) + { + prefixSum++; + } + } + + m_prefixOrignal[warpIdx] = prefixSum; + } +} + +void SeSchwarzPreconditioner::BuildLevel1() +{ + constexpr unsigned int blockDim = 1024;//32*32 + + int nBlocks = (m_numVerts + blockDim - 1) / blockDim; + + OMP_PARALLEL_FOR 
+ + for (int blockIdx = 0; blockIdx < nBlocks; ++blockIdx) + { + unsigned int warpPrefix[32]; + + unsigned int theBlockGlobalPrefix = 0; + + unsigned int globalWarpOffset = blockIdx * 32; + + for (int warpIdx = 0; warpIdx < blockIdx; ++warpIdx) + { + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + theBlockGlobalPrefix += m_prefixOrignal[threadIdx]; + } + } + + for (int warpIdx = 0; warpIdx < 32; ++warpIdx) + { + warpPrefix[warpIdx] = m_prefixOrignal[globalWarpOffset + warpIdx]; + } + + Intrinsic::SyncBlock(); + + for (unsigned int warpIdx = 0; warpIdx < 32; ++warpIdx) + { + unsigned int theWarpGlobalPrefix = theBlockGlobalPrefix; + + for (unsigned int prevWarpIdx = 0; prevWarpIdx < warpIdx; ++prevWarpIdx) + { + theWarpGlobalPrefix += warpPrefix[prevWarpIdx]; + } + + + unsigned int electedMask = 0; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= m_numVerts) { break; } + + + if (vid == m_numVerts - 1) + { + m_levelSize[1].x = theWarpGlobalPrefix + warpPrefix[warpIdx]; + m_levelSize[1].y = (m_numVerts + 31) / 32 * 32; + } + + unsigned int connMsk = m_fineConnectMask[vid]; + + unsigned int electedPrefix = Intrinsic::Popcount32(connMsk & Intrinsic::LanemaskLt(laneIdx)); + + if (electedPrefix == 0) + { + electedMask |= 1U << laneIdx; + } + //unsigned int electedMask = Intrinsic::BallotSync(-1, electedPrefix == 0); + } + + + unsigned int lanePrefix[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= m_numVerts) { break; } + + + lanePrefix[laneIdx] = Intrinsic::Popcount32(electedMask & Intrinsic::LanemaskLt(laneIdx)); + + lanePrefix[laneIdx] += theWarpGlobalPrefix; + } + + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int 
threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= m_numVerts) { break; } + + + unsigned int connMsk = m_fineConnectMask[vid]; + + unsigned int elected_lane = Intrinsic::Ffs(connMsk) - 1; + + unsigned int theLanePrefix = lanePrefix[elected_lane]; + + m_CoarseSpaceTables[0][vid] = theLanePrefix; + + m_goingNext[vid] = theLanePrefix + (m_numVerts + 31) / 32 * 32; + } + } + } +} + + +void SeSchwarzPreconditioner::BuildConnectMaskLx(int level) +{ + constexpr unsigned int warpDim = 32; + + int nWarps = (m_numVerts + warpDim - 1) / warpDim; + + const unsigned int bank = 32; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < nWarps; ++warpIdx) + { + unsigned int prefixMsk[32]; + unsigned int connectMsk[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + prefixMsk[laneIdx] = m_fineConnectMask[vid]; + + unsigned int coarseVid = m_CoarseSpaceTables[level - 1][vid]; + + connectMsk[laneIdx] = 0;// 1U << (coarseVid % bank); //==== include self + + unsigned int kn = m_mappedNeighborsNumRemain[vid]; + + unsigned int nk = 0; + + for (unsigned int k = 0; k < kn; k++) + { + unsigned int connect = m_mappedNeighborsRemain[k][vid]; + + unsigned int coarseConnect = m_CoarseSpaceTables[level - 1][connect]; + + if (coarseVid / bank == coarseConnect / bank) + { + unsigned int off = coarseConnect % bank; + + connectMsk[laneIdx] |= (1U << off); + } + else + { + m_mappedNeighborsRemain[nk][vid] = connect; + nk++; + } + } + + m_mappedNeighborsNumRemain[vid] = nk; + } + + bool isFullWarp = (prefixMsk[0] == -1); + + if (isFullWarp) + { + unsigned int connectMskFull = 0; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + connectMskFull |= connectMsk[laneIdx]; + } + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { 
break; } + + connectMsk[laneIdx] = connectMskFull; + } + } + else + { + unsigned int cacheMsk[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + cacheMsk[laneIdx] = 0; + } + + unsigned int electedLane[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + electedLane[laneIdx] = Intrinsic::Ffs(prefixMsk[laneIdx]) - 1; + + if (connectMsk[laneIdx]) + { + //cacheMsk[electedLane[laneIdx]] |= connectMsk[laneIdx]; + Intrinsic::AtomicOr(&cacheMsk[electedLane[laneIdx]], connectMsk[laneIdx]); + } + } + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + connectMsk[laneIdx] = cacheMsk[electedLane[laneIdx]]; + } + } + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + unsigned int coarseVid = m_CoarseSpaceTables[level - 1][vid]; + + unsigned int electedPrefix = Intrinsic::Popcount32(prefixMsk[laneIdx] & Intrinsic::LanemaskLt(laneIdx)); + + if (connectMsk[laneIdx] && electedPrefix == 0) + { + Intrinsic::AtomicOr(m_nextConnectMsk.data() + coarseVid, connectMsk[laneIdx]); + } + } + } +} + +void SeSchwarzPreconditioner::NextLevelCluster(int level) +{ + const int levelNum = m_levelSize[level].x; + + const int warpDim = 32; + + int numWarps = (m_numVerts + warpDim - 1) / warpDim; + + int maxWarpIdx = (levelNum + warpDim - 1) / warpDim; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < numWarps; ++warpIdx) + { + unsigned int cachedMsk[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= (levelNum + 31) / 32 * 32) + { + break;// early culling + } + + unsigned int connectMsk = (1U << laneIdx); + + if (vid < levelNum) + { + connectMsk |= m_nextConnectMsk[vid];// connect to current + } + + cachedMsk[laneIdx] = connectMsk; + + if (vid >= levelNum) + 
{ + break; + } + } + + Intrinsic::SyncWarp(); + + unsigned int prefixSum = 0; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= levelNum || vid >= (levelNum + 31) / 32 * 32) + { + break;// early culling + } + + unsigned int connectMsk = cachedMsk[laneIdx]; + + unsigned int visited = (1U << laneIdx); + + while (true) + { + unsigned int todo = visited ^ connectMsk; + + if (!todo) + { + break; + } + + unsigned int nextVisist = Intrinsic::Ffs(todo) - 1; + + visited |= (1U << nextVisist); + + connectMsk |= cachedMsk[nextVisist]; + } + + m_nextConnectMsk[vid] = connectMsk; + + unsigned int electedPrefix = Intrinsic::Popcount32(connectMsk & Intrinsic::LanemaskLt(laneIdx)); + + if (electedPrefix == 0) + { + prefixSum++; + } + } + + if (warpIdx < maxWarpIdx) + { + m_nextPrefix[warpIdx] = prefixSum; + } + } +} + +void SeSchwarzPreconditioner::PrefixSumLx(int level) +{ + const int levelNum = m_levelSize[level].x; + + const int levelBegin = m_levelSize[level].y; + + unsigned int blockDim = 1024; + + int nBlocks = (m_numVerts + blockDim - 1) / blockDim; + + OMP_PARALLEL_FOR + + for (int blockIdx = 0; blockIdx < nBlocks; ++blockIdx) + { + unsigned int warpPrefix[32]; + + unsigned int theBlockPrefix = 0; + + unsigned int globalWarpOffset = blockIdx * 32; + + for (int warpIdx = 0; warpIdx < blockIdx; ++warpIdx) + { + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + unsigned int vid = threadIdx + blockIdx * blockDim; + + if (vid >= (levelNum + blockDim - 1) / blockDim * blockDim) + { + break; + } + + theBlockPrefix += m_nextPrefix[threadIdx]; + } + } + + for (int warpIdx = 0; warpIdx < 32; ++warpIdx) + { + warpPrefix[warpIdx] = m_nextPrefix[globalWarpOffset + warpIdx]; + } + + Intrinsic::SyncBlock(); + + for (unsigned int warpIdx = 0; warpIdx < 32; ++warpIdx) + { + unsigned int theWarpPrefix = theBlockPrefix; + + for (unsigned int prevWarpIdx = 0; 
prevWarpIdx < warpIdx; ++prevWarpIdx) + { + theWarpPrefix += warpPrefix[prevWarpIdx]; + } + + unsigned int electedMsk = 0; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= levelNum) + { + break; + } + if (vid == levelNum - 1) + { + m_levelSize[level + 1].x = theWarpPrefix + warpPrefix[warpIdx]; + m_levelSize[level + 1].y = levelBegin + (levelNum + 31) / 32 * 32; + } + + unsigned int connMsk = m_nextConnectMsk[vid]; + + unsigned int electedPrefix = Intrinsic::Popcount32(connMsk & Intrinsic::LanemaskLt(laneIdx)); + + if (electedPrefix == 0) + { + electedMsk |= (1U << laneIdx); + } + + //unsigned int electedMask = Intrinsic::BallotSync(-1, electedPrefix == 0); + } + + unsigned int lanePrefix[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + lanePrefix[laneIdx] = Intrinsic::Popcount32(electedMsk & Intrinsic::LanemaskLt(laneIdx)); + lanePrefix[laneIdx] += theWarpPrefix; + } + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= levelNum) { break; } + + int electedLane = Intrinsic::Ffs(m_nextConnectMsk[vid]) - 1; + + unsigned int theLanePrefix = lanePrefix[electedLane]; + + m_nextConnectMsk[vid] = theLanePrefix; + + m_goingNext[vid + levelBegin] = theLanePrefix + levelBegin + (levelNum + 31) / 32 * 32; + } + } + } +} + +void SeSchwarzPreconditioner::ComputeNextLevel(int level) +{ + OMP_PARALLEL_FOR + + for (int vid = 0; vid < m_numVerts; ++vid) + { + int next = m_CoarseSpaceTables[level - 1][vid]; + + m_CoarseSpaceTables[level][vid] = m_nextConnectMsk[next]; + } +} + +void SeSchwarzPreconditioner::TotalNodes() +{ + m_totalNumberClusters = m_levelSize[m_numLevel].y; + //printf("%d\n", m_totalNumberClusters); +} + +void SeSchwarzPreconditioner::AggregationKernel() +{ + const int warpDim = 32; + + int 
numWarps = (m_numVerts + warpDim - 1) / warpDim; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < numWarps; ++warpIdx) + { + unsigned int firstInWarp = warpIdx * 32; + + int curr[32]; + int next[32]; + int aggLevel[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + curr[laneIdx] = vid; + + aggLevel[laneIdx] = m_numLevel - 1; + } + + Int4 coarseTable[32]; + + for (int l = 0; l < m_numLevel - 1; ++l) + { + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + next[laneIdx] = m_goingNext[curr[laneIdx]]; + } + + Intrinsic::SyncWarp(); + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + if (next[laneIdx] == next[0]) + { + aggLevel[laneIdx] = Math::Min(l, aggLevel[laneIdx]); + } + + curr[laneIdx] = next[laneIdx]; + coarseTable[laneIdx][l] = next[laneIdx]; + } + } + + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + m_denseLevel[vid] = aggLevel[laneIdx]; + + m_coarseTables[vid] = coarseTable[laneIdx]; + } + } +} + +void SeSchwarzPreconditioner::AdditionalSchwarzHessian2(SeMatrix3f hessian, std::vector& pAdditionalHessian, SeArray2D& pDenseHessian, int v1, int v2, const std::vector& pGoingNext, int nLevel, int vNum) +{ + int level = 0; + unsigned int myID = v1; + unsigned int otID = v2; + const unsigned int bank = 32; + + while (myID / bank != otID / bank && level < nLevel) + { + myID = pGoingNext[myID]; + otID = pGoingNext[otID]; + level++; + } + + if (level >= nLevel) + return; + + Intrinsic::AtomicAdd(&pDenseHessian[otID % bank][myID], hessian); + Intrinsic::AtomicAdd(&pDenseHessian[myID % bank][otID], hessian); + + if (level < nLevel - 1) + { + myID = pGoingNext[myID]; + otID = pGoingNext[otID]; + + if 
(myID == otID) + { + Intrinsic::AtomicAdd(&pAdditionalHessian[myID], hessian * 2.0f); + } + else + { + Intrinsic::AtomicAdd(&pAdditionalHessian[myID], hessian); + Intrinsic::AtomicAdd(&pAdditionalHessian[otID], hessian); + } + } +} + +void SeSchwarzPreconditioner::PrepareCollisionHessian() +{ + OMP_PARALLEL_FOR + + for (int i = 0; i < m_stencilNum; i++) + { + const auto& s = m_stencils[i]; + auto idx = m_stencilIndexMapped[i]; + + Float3 d(s.direction[0], s.direction[1], s.direction[2]); + + SeMatrix3f hessian = OuterProduct(d, d * s.stiff); + + for (int it = 0; it < s.verextNumPerStencil; it++) + { + Intrinsic::AtomicAdd(&m_additionalHessian32[idx[it]], hessian * Math::Square(s.weight[it])); + } + + for (int ita = 0; ita < s.verextNumPerStencil; ita++) + { + for (int itb = ita + 1; itb < s.verextNumPerStencil; itb++) + { + AdditionalSchwarzHessian2(s.weight[ita] * s.weight[itb] * hessian, m_additionalHessian32, m_hessian32, idx[ita], idx[itb], m_goingNext, m_numLevel, m_numVerts); + } + } + } +} + +void SeSchwarzPreconditioner::PrepareHessian(const SeMatrix3f* diagonal, const SeMatrix3f* csrOffDiagonals, const int* csrRanges) +{ + + const unsigned int bank = 32; + + int nVC = (m_numVerts + 31) / 32 * 32; + + OMP_PARALLEL_FOR + + for (int vid = nVC; vid < m_totalNumberClusters; ++vid) + { + auto oldDiagonal = m_additionalHessian32[vid]; + int myID = vid; + Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID], oldDiagonal); + while (true) + { + myID = m_goingNext[myID]; + if (myID >= m_totalNumberClusters) + { + break; + } + Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID], oldDiagonal); + } + } + + int nblock = (m_numVerts + 31) / 32; + OMP_PARALLEL + { + std::vector> diagTable(m_numLevel); + +#pragma omp for + for (int bid = 0; bid < nblock; ++bid) + { + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + bid * 32; + if (vid >= m_numVerts) + break; + const int vidOrig = m_MapperSortedGetOriginal[vid]; + int oldNum = 
m_mappedNeighborsNum[vid]; + + auto oldDiagonal = diagonal[vidOrig] + m_additionalHessian32[vid]; + m_hessian32[vid % bank][vid] += oldDiagonal; // self diagonal + + for (int k = 1; k < oldNum; k++) + { + const unsigned int neighbor = m_mappedNeighbors[k][vid]; + SeMatrix3f mat = csrOffDiagonals[csrRanges[vidOrig] + k - 1]; + int level = 0; + unsigned int levelSz = m_numVerts; + unsigned int myID = vid; + unsigned int otID = neighbor; + + while (myID / bank != otID / bank && level < m_numLevel) + { + level++; + myID = m_goingNext[myID]; + otID = m_goingNext[otID]; + } + if (level >= m_numLevel) + { + continue; + } + if (level <= 1) // 按照block分并行,level 1的位置也一定属于本线程 + m_hessian32[otID % bank][myID] += mat; + else + Intrinsic::AtomicAdd(&m_hessian32[otID % bank][myID], mat); + + if (level == 0) + oldDiagonal += mat; + else if (level + 1 < m_numLevel) + { + myID = m_goingNext[myID]; + auto& table = diagTable[level + 1]; + if (table.find(myID) == table.end()) + table[myID] = mat; + else + table[myID] += mat; + } + } + if (1 < m_numLevel) + { + int myID = m_goingNext[vid]; + m_hessian32[myID % bank][myID] += oldDiagonal; // self diagonal + if (2 < m_numLevel) + { + myID = m_goingNext[myID]; + auto& table = diagTable[2]; + if (table.find(myID) == table.end()) + table[myID] = oldDiagonal; + else + table[myID] += oldDiagonal; + } + } + } + } + + for (int lv = 2; lv < m_numLevel; lv++) + { + const auto& table = diagTable[lv]; + for (auto& pair : table) + { + int myID = pair.first; + Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID],pair.second); + if (lv + 1 < m_numLevel) + { + myID = m_goingNext[myID]; + auto& nexttable = diagTable[lv + 1]; + if (nexttable.find(myID) == nexttable.end()) + nexttable[myID] = pair.second; + else + nexttable[myID] += pair.second; + } + } + } + } +} + +void SeSchwarzPreconditioner::LDLtInverse512() +{ + const int triSz = (1 + 96) * 96 / 2 + 16 * 3; + + const int activeblockNum = m_totalNumberClusters / 32; + + OMP_PARALLEL_FOR + + for (int 
block = 0; block < activeblockNum; block++) + { + float A[96][96] = {}; + + for (int x = 0; x < 32; x++) + { + for (int y = 0; y < 32; y++) + { + SeMatrix3f temp = m_hessian32[y][x + block * 32]; + + if (x == y && temp(0, 0) == 0.0f) + { + temp = SeMatrix3f::Identity(); + } + for (int ii = 0; ii < 3; ii++) + { + for (int jj = 0; jj < 3; jj++) + { + A[x * 3 + ii][y * 3 + jj] = temp(ii, jj); + } + } + } + } + + //const int pt = -1;//212; + //if (block == pt) + //{ + // std::ofstream ofs("c:\\test\\MRF.txt"); + // for (int y = 0; y < 96; y++) + // { + // for (int x = 0; x < 96; x++) + // { + // ofs << A[y][x] << " "; + // } + // ofs << std::endl; + // } + //} + + // 向下消元 +#ifdef WIN32 + for (int x = 0; x < 96; x++) + { + float diag = A[x][x]; + __m256 line[12]; + for (int it = 0; it < 12; it++) + line[it] = _mm256_loadu_ps(A[x] + it * 8); + + for (int y = x + 1; y < 96; y++) + { + if (A[y][x] == 0.0f) + continue; + float r = -A[y][x] / diag; + __m256 ratio = _mm256_set1_ps(r); + for (int it = 0; it < 12; it++) + { + __m256 temp = _mm256_fmadd_ps(ratio, line[it], _mm256_loadu_ps(A[y] + it * 8)); // A[y] += A[x] * ratio + _mm256_storeu_ps(A[y] + it * 8, temp); + } + A[y][x] = r; + } + } + + + // inv diagonal + float diagonal[96] = {}; + for (int y = 0; y < 96; y++) + { + diagonal[95 - y] = A[y][y]; + A[y][y] = 1.0f; + for (int x = y + 1; x < Math::Min(96, y + 9); x++) + { + A[y][x] = 0.0f; + } + } + for (int it = 0; it < 12; it++) + { + __m256 temp = _mm256_div_ps(_mm256_set1_ps(1.0f), _mm256_loadu_ps(diagonal + it * 8)); + _mm256_storeu_ps(diagonal + it * 8, temp); + } + + int off = block * triSz; + // output diagonal + for (int it = 0; it < 12; it++) + { + int lc = 96 - it * 8; + __m256 acc = _mm256_setzero_ps(); + for (int l = 0; l < lc; l++) + { + __m256 a = _mm256_loadu_ps(A[95 - l] + it * 8); + a = _mm256_mul_ps(a, a); + __m256 r = _mm256_set1_ps(diagonal[l]); + acc = _mm256_fmadd_ps(r, a, acc); + } + _mm256_storeu_ps(&m_invSymR[off + it * 8], acc); + } + off += 
12 * 8; + for (int it = 0; it < 12; it++) + { + int xBg = it * 8; + for (int scan = xBg + 1; scan <= (96 - 8); scan++) + { + int lc = 96 - scan; + __m256 acc = _mm256_setzero_ps(); + for (int l = 0; l < lc; l++) + { + __m256 a = _mm256_loadu_ps(A[95 - l] + xBg); + __m256 b = _mm256_loadu_ps(A[95 - l] + scan); + a = _mm256_mul_ps(a, b); + __m256 r = _mm256_set1_ps(diagonal[l]); + acc = _mm256_fmadd_ps(r, a, acc); + } + _mm256_storeu_ps(&m_invSymR[off], acc); + off += 8; + } + } +#else + //FIXME! + float diagonal[96] = {}; + int off = block * triSz; +#endif + for (int it = 0; it < 12; it++) + { + for (int lane = 0; lane < 7; lane++) + { + int xBg = it * 8 + lane; + for (int h = (96 - 7 + lane); h < 96; h++) + { + int lc = 96 - h; + float acc = 0.f; + for (int l = 0; l < lc; l++) + { + float a = A[95 - l][xBg]; + float b = A[95 - l][h]; + float r = diagonal[l]; + acc += a * b * r; + } + m_invSymR[off] = acc; + off++; + } + } + } + + //if (block == pt) + //{ + // std::ofstream ofs("c:\\test\\MRF3.txt"); + // float B[96][96] = {}; + // int Noff = block * triSz; + // for (int i = 0; i < 96; i++) + // { + // B[i][i] = m_invSymR[i + Noff]; + // } + // Noff += 96; + // // body + // for (int it = 0; it < 12; it++) + // { + // int xBg = it * 8; + // for (int scan = xBg + 1; scan <= (96 - 8); scan++) + // { + // for (int l = 0; l < 8; l++) + // { + // float value = m_invSymR[Noff + l]; + // B[scan + l][xBg + l] = value; + // } + // Noff += 8; + // } + // } + // // remainder + // for (int it = 0; it < 12; it++) + // { + // for (int lane = 0; lane < 7; lane++) + // { + // int xBg = it * 8 + lane; + // for (int h = (96 - 7 + lane); h < 96; h++) + // { + // float value = m_invSymR[Noff]; + // Noff++; + // B[h][xBg] = value; + // } + // } + // } + // + // for (int y = 0; y < 96; y++) + // { + // for (int x = 0; x < 96; x++) + // { + // ofs << B[y][x] << " "; + // } + // ofs << std::endl; + // } + //} + } +} + +void SeSchwarzPreconditioner::BuildResidualHierarchy(const SeVec3fSimd* 
m_cgResidual) +{ + int nblock = (m_numVerts + 31) / 32; + + OMP_PARALLEL + { + std::vector> diagTable(m_numLevel); + +#pragma omp for + + for (int bid = 0; bid < nblock; ++bid) + { + for (int lane = 0; lane < 32; lane++) + { + int vid = lane + bid * 32; + if (vid >= m_numVerts) + break; + unsigned int vidOrig = m_MapperSortedGetOriginal[vid]; + SeVec3fSimd r = m_cgResidual[vidOrig]; + m_mappedR[vid] = r; + if (1 < m_numLevel) + { + int next = m_goingNext[vid]; + m_mappedR[next] += r; + } + } + } + } + + if (m_numLevel > 2) + { + int bg = m_levelSize[1].y; + int ed = m_levelSize[2].y; + for (int vid = bg; vid < ed; vid++) + { + SeVec3fSimd r = m_mappedR[vid]; + int nx = vid; + for (int lv = 2; lv < m_numLevel; lv++) + { + nx = m_goingNext[nx]; + m_mappedR[nx] += r; + } + } + } + + //printf("level %d\n", m_nLevel); + + //for (int l = 0; l < m_nLevel + 1; l++) + // printf("%d ",m_levelSize[l].y); + //printf("\n"); +} + +void SeSchwarzPreconditioner::SchwarzLocalXSym() +{ + constexpr int blockDim = 32; + + const int activeblockNum = m_totalNumberClusters / blockDim; + + const int triSz = (1 + 96) * 96 / 2 + 16 * 3; + + OMP_PARALLEL_FOR + + for (int blockIdx = 0; blockIdx < activeblockNum; ++blockIdx) + { + float cacheRhs[96]; + float cacheOut[96]; + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + blockIdx * blockDim; + auto rhs = m_mappedR[vid]; + cacheRhs[laneIdx * 3 + 0] = rhs.x; + cacheRhs[laneIdx * 3 + 1] = rhs.y; + cacheRhs[laneIdx * 3 + 2] = rhs.z; + } +#ifdef WIN32 + // ------------------------------------ + // diagnal + int off = blockIdx * triSz; + for (int it = 0; it < 12; it++) + { + __m256 diag = _mm256_loadu_ps(&m_invSymR[off + it * 8]); + __m256 diagRhs = _mm256_loadu_ps(&cacheRhs[it * 8]); + diag = _mm256_mul_ps(diag, diagRhs); + _mm256_storeu_ps(&cacheOut[it * 8], diag); + } + off += 8 * 12; + for (int it = 0; it < 11; it++) + { + int xBg = it * 8; + __m256 sigularRhs = _mm256_loadu_ps(&cacheRhs[xBg]); + __m256 sigularResult = 
_mm256_setzero_ps(); + for (int scan = xBg + 1; scan <= (96 - 8); scan++) + { + __m256 mtx = _mm256_loadu_ps(&m_invSymR[off]); + off += 8; + + __m256 scanRhs = _mm256_loadu_ps(&cacheRhs[scan]); + sigularResult = _mm256_fmadd_ps(mtx, scanRhs, sigularResult); + + __m256 scanResult = _mm256_loadu_ps(&cacheOut[scan]); + scanResult = _mm256_fmadd_ps(mtx, sigularRhs, scanResult); + _mm256_storeu_ps(&cacheOut[scan], scanResult); + } + + __m256 sigularResultOrg = _mm256_loadu_ps(&cacheOut[xBg]); + sigularResult = _mm256_add_ps(sigularResultOrg, sigularResult); + _mm256_storeu_ps(&cacheOut[xBg], sigularResult); + } +#else + //FIXME! + int off = blockIdx * triSz; +#endif + + for (int it = 0; it < 12; it++) + { + for (int lane = 0; lane < 7; lane++) + { + int xBg = it * 8 + lane; + float sigularRhs = cacheRhs[xBg]; + float singularResult = 0.0f; + for (int h = (96 - 7 + lane); h < 96; h++) + { + float value = m_invSymR[off]; + off++; + if (value) + { + singularResult += value * cacheRhs[h]; + cacheOut[h] += value * sigularRhs; + } + } + cacheOut[xBg] += singularResult; + } + } + + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + blockIdx * blockDim; + + SeVec3fSimd out(0.0f); + out.x = cacheOut[laneIdx * 3 + 0]; + out.y = cacheOut[laneIdx * 3 + 1]; + out.z = cacheOut[laneIdx * 3 + 2]; + + m_mappedZ[vid] = out; + } + + } +} + +void SeSchwarzPreconditioner::CollectFinalZ(SeVec3fSimd* m_cgZ) +{ + OMP_PARALLEL_FOR + + for (int vid = 0; vid < m_numVerts; ++vid) + { + unsigned int mappedIndex = m_MapperSortedGetOriginal[vid]; + + auto z = m_mappedZ[vid]; + + Int4 table = m_coarseTables[vid]; + + for (int l = 1; l < Math::Min(m_numLevel, 4); l++) + { + int now = table[l - 1]; + + z += m_mappedZ[now]; + } + + m_cgZ[mappedIndex] = z; + } +} diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditioner.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditioner.h new file mode 100644 
index 0000000..01f7adf --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditioner.h @@ -0,0 +1,180 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma once

#include "SeCsr.h"
#include "SeMorton.h"
#include "SeAabbSimd.h"
#include "SeCollisionElements.h"


SE_NAMESPACE_BEGIN

// Multilevel domain-decomposition (additive Schwarz) preconditioner for the PCG
// solves used in cloth / deformable-body simulation.
//
// Usage protocol (the three public entry points, in order):
//   1. AllocatePrecoditioner()  — once per frame, before time integration
//   2. PreparePreconditioner()  — once per solve, before the PCG iteration loop
//   3. Preconditioning()        — every PCG iteration (computes z ≈ M⁻¹ r)
//
// NOTE(review): in this copy all angle-bracket template arguments (the element
// types of std::vector / SeArray2D / SeAabb, and one #include) appear to have
// been stripped by text extraction — restore them from upstream before building.
class SeSchwarzPreconditioner
{

public:

	//==== input data

	const SeVec3fSimd* m_positions = nullptr; // the mesh nodal positions

	//==== input mesh topology data for collision computation

	const Int4* m_edges = nullptr; // indices of the two adjacent vertices and the two opposite vertices of edges
	const Int4* m_faces = nullptr; // indices of the three adjacent vertices of faces, the fourth is useless

	const SeCsr* m_neighbours = nullptr; // the adjacent information of vertices stored in a csr format

public:

	//==== call before time integration once a frame
	void AllocatePrecoditioner(int numVerts, int numEdges, int numFaces);

	//==== call before PCG iteration loop
	void PreparePreconditioner(const SeMatrix3f* diagonal, const SeMatrix3f* csrOffDiagonals, const int* csrRanges,
		const EfSet* efSets, const EeSet* eeSets, const VfSet* vfSets, unsigned int* efCounts, unsigned int* eeCounts, unsigned int* vfCounts);

	//==== call during PCG iterations
	void Preconditioning(SeVec3fSimd* z, const SeVec3fSimd* residual, int dim);

private:

	int m_numVerts = 0;
	int m_numEdges = 0;
	int m_numFaces = 0;

	int m_frameIndex = 0; // frame counter; hierarchy rebuild work is amortized across frames

	int m_totalSz = 0;    // total node count summed over all levels (padded)
	int m_numLevel = 0;   // number of levels in the cluster hierarchy

	int m_totalNumberClusters;


	SeAabb m_aabb; // bounding box of all positions; input to Morton encoding


	SeArray2D m_hessianMapped;
	std::vector m_mappedNeighborsNum;
	std::vector m_mappedNeighborsNumRemain;
	SeArray2D m_mappedNeighbors;
	SeArray2D m_mappedNeighborsRemain;

	SeArray2D m_CoarseSpaceTables; // per-level vertex -> coarse-cluster index
	std::vector m_prefixOrignal;
	std::vector m_fineConnectMask;
	std::vector m_nextConnectMsk;
	std::vector m_nextPrefix;

	std::vector m_mappedPos; // positions permuted into Morton-sorted order

	std::vector m_coarseTables; // per-vertex list of coarse-level ancestors
	std::vector m_goingNext;    // node -> its cluster on the next level
	std::vector m_denseLevel;
	std::vector m_levelSize; // .x = current level size .y = (prefixed) current level begin index

	std::vector m_mappedR;
	std::vector m_mappedZ;
	std::vector m_MapperSortedGetOriginal; // sorted by morton
	std::vector m_mapperOriginalGetSorted;
	std::vector m_mortonCode;

	SeArray2D m_hessian32;             // 32-wide diagonal-block Hessian storage
	std::vector m_additionalHessian32; // extra diagonal terms from collision stencils
	std::vector m_invSymR;             // packed inverse factors used by SchwarzLocalXSym

	int m_stencilNum;
	int m_maxStencilNum;
	std::vector m_stencils;
	std::vector m_stencilIndexMapped;

private:

	//==== hierarchy / preconditioner internals (cf. the inline reference
	//     implementations in SeSchwarzPreconditionerPreviousVersion.h)

	void ComputeLevelNums(int bankSize); // derives m_numLevel and m_totalSz from the vertex count

	void DoAlllocation(); // one-time buffer allocation (sic: upstream spelling)

	void ComputeTotalAABB();

	void ComputeAABB(); // accumulates all m_positions into m_aabb

	void SpaceSort();

	void FillSortingData(); // Morton-encodes AABB-normalized positions

	void DoingSort(); // sorts m_MapperSortedGetOriginal by Morton code

	void ComputeInverseMapper(); // builds m_mapperOriginalGetSorted from the sorted mapper

	void MapHessianTable();

	void PrepareCollisionStencils(const EfSet* efSets, const EeSet* eeSets, const VfSet* vfSets, unsigned int* efCounts, unsigned int* eeCounts, unsigned int* vfCounts);

	void MapCollisionStencilIndices(); // remaps stencil vertex indices into sorted order

	void ReorderRealtime(); // rebuilds the whole cluster hierarchy for this solve

	void BuildConnectMaskL0();

	void BuildCollisionConnection(unsigned int* pConnect, const int* pCoarseTable);

	void PreparePrefixSumL0();

	void BuildLevel1();

	void BuildConnectMaskLx(int level);

	void NextLevelCluster(int level);

	void PrefixSumLx(int level);

	void ComputeNextLevel(int level);

	void TotalNodes(); // records m_totalNumberClusters from the level-size table

	void AggregationKernel(); // fills m_denseLevel and m_coarseTables per vertex

	void AdditionalSchwarzHessian2(SeMatrix3f hessian, std::vector& pAdditionalHessian, SeArray2D& pDenseHessian, int v1, int v2, const std::vector& pGoingNext, int nLevel, int vNum);

	void PrepareCollisionHessian();

	void PrepareHessian(const SeMatrix3f* diagonal, const SeMatrix3f* csrOffDiagonals, const int* csrRanges);

	void LDLtInverse512(); // factorizes/inverts the 96x96 per-block systems into m_invSymR

	void BuildResidualHierarchy(const SeVec3fSimd* m_cgResidual);

	void SchwarzLocalXSym(); // applies the packed inverse blocks: m_mappedZ = inv * m_mappedR

	void CollectFinalZ(SeVec3fSimd* m_cgZ); // sums levels and scatters z back to caller order
};

SE_NAMESPACE_END
\ No newline at end of file
diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditionerPreviousVersion.h
b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditionerPreviousVersion.h new file mode 100644 index 0000000..e45950d --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeSchwarzPreconditionerPreviousVersion.h @@ -0,0 +1,2014 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include "SeCsr.h" +#include "SeMorton.h" +#include "SeAabbSimd.h" +#include "SeIntrinsic.h" +#include "SeCollisionElements.h" + +#include + +SE_NAMESPACE_BEGIN + +class SeSchwarzPreconditionerPreviousVersion +{ + +public: + + //==== input data + + const SeVec3fSimd* m_positions = nullptr; // the mesh nodal positions + + //==== input mesh topology data for collision computation + + const Int4* m_edges = nullptr; // indices of the two adjacent vertices and the two opposite vertices of edges + const Int4* m_faces = nullptr; // indices of the three adjacent vertices of faces, the fourth is useless + + const SeCsr* m_neighbours = nullptr; // the adjacent information of vertices stored in a csr format + +public: + + //==== call before time integration once a frame + void AllocatePrecoditioner(int numVerts, int numEdges, int numFaces) + { + m_numVerts = numVerts; + m_numEdges = numEdges; + m_numFaces = numFaces; + + if (m_frameIndex == 0) + { + DoAlllocation(); + } + + if (m_frameIndex % 17 != 0) { return; } + + //==== + + ComputeAABB(); + + FillSortingData(); + + DoingSort(); + + ComputeInverseMapper(); + + MapHessianTable(); + + m_frameIndex++; + } + + //==== call before PCG iteration loop + void PreparePreconditioner(const SeMatrix3f* diagonal, const SeMatrix3f* csrOffDiagonals, const int* csrRanges, + const EfSet* efSets, const EeSet* eeSets, const VfSet* vfSets, unsigned int* efCounts, unsigned int* eeCounts, unsigned int* vfCounts) + { + m_mappedNeighborsRemain = m_mappedNeighbors; + m_mappedNeighborsNumRemain = m_mappedNeighborsNum; + + //==== before reordering + + PrepareCollisionStencils(efSets, eeSets, vfSets, efCounts, eeCounts, vfCounts); + + //==== + + ReorderRealtime(); + + //==== + + const int blockNum = m_totalSz / 32; + Utility::MemsetZero(m_invSymR); + Utility::MemsetZero(m_hessian32.Ptr(), m_hessian32.Size()); + Utility::MemsetZero(m_additionalHessian32); + + //==== + + PrepareCollisionHessian(); + + PrepareHessian(diagonal, 
csrOffDiagonals, csrRanges); + + LDLtInverse512(); + } + + //==== call during PCG iterations + void Preconditioning(SeVec3fSimd* z, const SeVec3fSimd* residual, int dim) + { + Utility::MemsetZero(m_mappedZ); + Utility::MemsetZero(m_mappedR); + + BuildResidualHierarchy(residual); + + SchwarzLocalXSym(); + + CollectFinalZ(z); + } + +private: + + int m_numVerts = 0; + int m_numEdges = 0; + int m_numFaces = 0; + + int m_frameIndex = 0; + + int m_totalSz = 0; + int m_numLevel = 0; + + int m_totalNumberClusters; + + + SeAabb m_aabb; + + + SeArray2D m_hessianMapped; + std::vector m_mappedNeighborsNum; + std::vector m_mappedNeighborsNumRemain; + SeArray2D m_mappedNeighbors; + SeArray2D m_mappedNeighborsRemain; + + SeArray2D m_CoarseSpaceTables; + std::vector m_prefixOrignal; + std::vector m_fineConnectMask; + std::vector m_nextConnectMsk; + std::vector m_nextPrefix; + + std::vector m_mappedPos; + + std::vector m_coarseTables; + std::vector m_goingNext; + std::vector m_denseLevel; + std::vector m_levelSize; // .x = current level size .y = (prefixed) current level begin index + + std::vector m_mappedR; + std::vector m_mappedZ; + std::vector m_MapperSortedGetOriginal; // sorted by morton + std::vector m_mapperOriginalGetSorted; + std::vector m_mortonCode; + + SeArray2D m_hessian32; + std::vector m_additionalHessian32; + std::vector m_invSymR; + + int m_stencilNum; + int m_maxStencilNum; + std::vector m_stencils; + std::vector m_stencilIndexMapped; + +private: + + void ComputeLevelNums(int bankSize) + { + constexpr float SizeRatio = 1.5f; + + //==== + + int totalSz = 0; + + int nLevel = 1; + int levelSz = (m_numVerts + bankSize - 1) / bankSize * bankSize; + totalSz += levelSz; + + while (levelSz > 32) + { + levelSz /= 32; + + nLevel++; + levelSz = (levelSz + bankSize - 1) / bankSize * bankSize; + totalSz += levelSz; + } + + m_numLevel = nLevel; + m_totalSz = totalSz * SizeRatio; + } + + void DoAlllocation() + { + constexpr int bankSize = 32; //==== size of diagonal block + + 
ComputeLevelNums(bankSize); + + //==== + + m_MapperSortedGetOriginal.resize(m_numVerts); + m_mapperOriginalGetSorted.resize(m_numVerts); + m_mortonCode.resize(m_numVerts); + + m_hessian32.Resize(bankSize, m_totalSz); + m_mappedZ.resize(m_totalSz); + m_mappedR.resize(m_totalSz); + + m_mappedPos.resize(m_numVerts); + + m_denseLevel.resize(m_numVerts); + m_goingNext.resize(m_numLevel * m_numVerts); + m_coarseTables.resize(m_numVerts); + + m_prefixOrignal.resize(m_numVerts); + m_fineConnectMask.resize(m_numVerts); + m_nextConnectMsk.resize(m_numVerts); + m_nextPrefix.resize(m_numVerts); + + + int triSz = (1 + 96) * 96 / 2 + 16 * 3; + const int blockNum = m_totalSz / 32; + m_invSymR.resize(triSz * blockNum); + m_additionalHessian32.resize(m_totalSz + 1); + + //==== + + m_levelSize.resize(m_numLevel + 1); + m_CoarseSpaceTables.Resize(m_numLevel, m_numVerts); + + int maxNeighbours = 0; + + for (int vId = 0; vId < m_numVerts; ++vId) + { + maxNeighbours = Math::Max(m_neighbours->Size(vId) + 1, maxNeighbours); + } + + m_mappedNeighborsNum.resize(m_numVerts); + m_mappedNeighborsNumRemain.resize(m_numVerts); + m_mappedNeighbors.Resize(maxNeighbours, m_numVerts); + m_mappedNeighborsRemain.Resize(maxNeighbours, m_numVerts); + } + + void ComputeAABB() + { + m_aabb = SeAabb(); + + for (int tid = 0; tid < m_numVerts; ++tid) + { + m_aabb += m_positions[tid]; + } + } + + void FillSortingData() + { + OMP_PARALLEL_FOR + + for (int tid = 0; tid < m_numVerts; ++tid) + { + SeVec3fSimd temp = (m_positions[tid] - m_aabb.Lower) / m_aabb.Extent(); + + SeMorton64 mt; + + mt.Encode(temp.x, temp.y, temp.z); + + m_mortonCode[tid] = mt; + + m_MapperSortedGetOriginal[tid] = tid; + } + } + + void DoingSort() + { + //==== sort m_MapperSortedGetOriginal by m_mortonCode1; + + std::sort(m_MapperSortedGetOriginal.begin(), m_MapperSortedGetOriginal.end(), [&](int i, int j) { return m_mortonCode[i] < m_mortonCode[j]; }); + } + + void ComputeInverseMapper() + { + OMP_PARALLEL_FOR + + for (int vid = 0; vid < 
m_numVerts; ++vid) + { + int originalIndex = m_MapperSortedGetOriginal[vid]; + + m_mapperOriginalGetSorted[originalIndex] = vid; + } + } + + void MapHessianTable() + { + OMP_PARALLEL_FOR + + for (int vid = 0; vid < m_numVerts; ++vid) + { + const auto originalIndex = m_MapperSortedGetOriginal[vid]; + + m_mappedPos[vid] = m_positions[originalIndex]; + + //==== + + int numNeighbors = m_neighbours->Size(originalIndex); + + const int* neighbors = m_neighbours->IdxPtr(originalIndex); + + + m_mappedNeighborsNum[vid] = numNeighbors + 1; // include self + + m_mappedNeighbors[0][vid] = vid; // include self + + for (int k = 1; k < numNeighbors + 1; ++k) + { + unsigned int originalNeighbors = neighbors[k - 1];//m_neighbors[k][originalIndex]; + + m_mappedNeighbors[k][vid] = m_mapperOriginalGetSorted[originalNeighbors]; + } + } + } + + void MapCollisionStencilIndices() + { + m_stencilIndexMapped.resize(m_stencilNum); + + OMP_PARALLEL_FOR + + for (int i = 0; i < m_stencilNum; i++) + { + const auto& s = m_stencils[i]; + + for (int vi = 0; vi < s.verextNumPerStencil; vi++) + { + m_stencilIndexMapped[i][vi] = m_mapperOriginalGetSorted[s.index[vi]]; + } + } + } + + void PrepareCollisionStencils(const EfSet* efSets, const EeSet* eeSets, const VfSet* vfSets, unsigned int* efCounts, unsigned int* eeCounts, unsigned int* vfCounts) + { + int efNum = efCounts[m_numEdges]; + int eeNum = eeCounts[m_numEdges]; + int vfNum = vfCounts[m_numVerts]; + + int totalStencilNum = efNum + eeNum + vfNum; + + if (totalStencilNum > m_maxStencilNum) + { + totalStencilNum = m_maxStencilNum; + printf("stencil size %d exceed max stencils num %d\n", totalStencilNum, m_maxStencilNum); + } + + m_stencilNum = 0; + + OMP_PARALLEL_FOR + + for (int i = 0; i < totalStencilNum; i++) + { + Stencil s; + + if (i < efNum) //process ef + { + const auto& pair = efSets[i]; + + if (pair.m_eId < 0 || pair.m_fId < 0) { continue; } + + const auto& edge = m_edges[pair.m_eId]; + const auto& face = m_faces[pair.m_fId]; + + 
s.verextNumPerStencil = 5; + s.vertexNumOfFirstPrimitive = 2; + + s.index[0] = edge[0]; + s.index[1] = edge[1]; + s.index[2] = face[0]; + s.index[3] = face[1]; + s.index[4] = face[2]; + + s.weight[0] = pair.m_bary[0]; + s.weight[1] = 1.f - pair.m_bary[0]; + s.weight[2] = -pair.m_bary[1]; + s.weight[3] = -pair.m_bary[2]; + s.weight[4] = -(1.f - pair.m_bary[1] - pair.m_bary[2]); + + s.direction = pair.m_normal; + + s.stiff = pair.stiff; + + } + else if (i < efNum + eeNum) // process ee + { + const auto& pair = eeSets[i]; + + if (pair.m_eId1 < 0 || pair.m_eId0 < 0) { continue; } + + s.verextNumPerStencil = 4; + s.vertexNumOfFirstPrimitive = 2; + + const auto& edge0 = m_edges[pair.m_eId0]; + const auto& edge1 = m_edges[pair.m_eId1]; + + s.index[0] = edge0[0]; + s.index[1] = edge0[1]; + s.index[2] = edge1[0]; + s.index[3] = edge1[1]; + + s.weight[0] = pair.m_bary[0]; + s.weight[1] = 1.f - pair.m_bary[0]; + s.weight[2] = -pair.m_bary[1]; + s.weight[3] = -(1.f - pair.m_bary[1]); + + s.direction = pair.m_normal; + + s.stiff = pair.stiff; + } + else if (i < totalStencilNum) //process vf + { + const auto& pair = vfSets[i]; + + if (pair.m_vId < 0 || pair.m_fId < 0) { continue; } + + s.verextNumPerStencil = 4; + s.vertexNumOfFirstPrimitive = 3; + + const auto& face = m_faces[pair.m_fId]; + + s.index[0] = face[0]; + s.index[1] = face[1]; + s.index[2] = face[2]; + s.index[3] = pair.m_vId; + + s.weight[0] = -pair.m_bary[0]; + s.weight[1] = -pair.m_bary[1]; + s.weight[2] = -(1.f - pair.m_bary[2]); + s.weight[3] = 1.f; + + s.direction = pair.m_normal; + + s.stiff = pair.stiff; + } + + int stencilNum = Intrinsic::AtomicAdd(&m_stencilNum, 1); + + m_stencils[stencilNum] = s; + } + + MapCollisionStencilIndices(); //1 + } + + void ReorderRealtime() + { + Utility::MemsetZero(m_levelSize); + + BuildConnectMaskL0(); + + BuildCollisionConnection(m_fineConnectMask.data(), nullptr); //2 + + PreparePrefixSumL0(); + + BuildLevel1(); + + for (int level = 1; level < m_numLevel; level++) + { + 
Utility::MemsetZero(m_nextConnectMsk); + + BuildConnectMaskLx(level); + + BuildCollisionConnection(m_nextConnectMsk.data(), m_CoarseSpaceTables[level - 1]); + + NextLevelCluster(level); + + PrefixSumLx(level); + + ComputeNextLevel(level); + } + + TotalNodes(); + + AggregationKernel(); + } + + void BuildConnectMaskL0() + { + const unsigned int warpDim = 32; + + int nWarps = (m_numVerts + warpDim - 1) / warpDim; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < nWarps; ++warpIdx) + { + for (int laneIdx = 0; laneIdx < warpDim; ++laneIdx) + { + int vId = warpIdx * warpDim + laneIdx; + + if (vId >= m_numVerts) + { + break; + } + + int numNeighbors = m_mappedNeighborsNumRemain[vId]; + + unsigned int connetMsk = 1U << laneIdx; // include self + + int nk = 0; + + for (int k = 0; k < numNeighbors; k++) + { + int vIdConnected = m_mappedNeighborsRemain[k][vId]; + + int warpIdxConnected = vIdConnected / warpDim; + + if (warpIdx == warpIdxConnected) // in the same warp + { + unsigned int laneIdxConnected = vIdConnected % warpDim; + + connetMsk |= (1U << laneIdxConnected); + } + else + { + m_mappedNeighborsRemain[nk][vId] = vIdConnected; + nk++; + } + } + + m_mappedNeighborsNumRemain[vId] = nk; + + // distance cullling + //const SeVec3fSimd& pos = m_mappedPos[vId]; + //for (int it = 0; it < Intrinsic::Popcount32(activeMsk); it++) + //{ + // const float th = AABB_TH * AABB_TH; + // CuFloat3 po(0.0f); + // po.x = Intrinsic::Shuffle(activeMsk, pos.x, it); + // po.y = Intrinsic::Shuffle(activeMsk, pos.y, it); + // po.z = Intrinsic::Shuffle(activeMsk, pos.z, it); + // if (Math::SqrLength(po - pos) < th) + // { + // connetMsk |= (1U << it); + // } + //} + + m_fineConnectMask[vId] = connetMsk; + } + } + } + + void BuildCollisionConnection(unsigned int* pConnect, const int* pCoarseTable) + { + const int bank = 32; + + OMP_PARALLEL_FOR + + for (int i = 0; i < m_stencilNum; i++) + { + const auto& s = m_stencils[i]; + Int5 idx = m_stencilIndexMapped[i]; + unsigned int connMsk[5] = 
{}; + + if (pCoarseTable) + { + for (int it = 0; it < s.verextNumPerStencil; it++) + { + idx[it] = pCoarseTable[idx[it]]; + } + } + + for (int ita = 0; ita < s.verextNumPerStencil; ita++) + { + for (int itb = ita + 1; itb < s.verextNumPerStencil; itb++) + { + unsigned int myID = idx[ita]; + unsigned int otID = idx[itb]; + if (myID == otID) // redundant ? + { + continue; + } + if ((myID / bank == otID / bank)) + { + if (ita < s.vertexNumOfFirstPrimitive && itb >= s.vertexNumOfFirstPrimitive) + { + connMsk[ita] |= (1U << (otID % bank)); + connMsk[itb] |= (1U << (myID % bank)); + } + } + } + } + + for (int it = 0; it < s.verextNumPerStencil; it++) + { + if (connMsk[it]) + { + Intrinsic::AtomicOr(&pConnect[idx[it]], connMsk[it]); + } + } + } + } + + void PreparePrefixSumL0() + { + int warpDim = 32; + + int nWarps = (m_numVerts + warpDim - 1) / warpDim; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < nWarps; ++warpIdx) + { + unsigned int cacheMask[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vId = laneIdx + warpIdx * warpDim; + + if (vId >= m_numVerts) { break; } + + cacheMask[laneIdx] = m_fineConnectMask[vId]; + } + + Intrinsic::SyncWarp(); + + int prefixSum = 0; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * warpDim; + + if (vid >= m_numVerts) { break; } + + unsigned int connetMsk = cacheMask[laneIdx]; + + unsigned int visited = (1U << laneIdx); + + while (connetMsk != -1) + { + unsigned int todo = visited ^ connetMsk; + + if (!todo) + { + break; + } + + unsigned int nextVisit = Intrinsic::Ffs(todo) - 1; + + visited |= (1U << nextVisit); + + connetMsk |= cacheMask[nextVisit]; + } + + m_fineConnectMask[vid] = connetMsk; + + unsigned int electedPrefix = Intrinsic::Popcount32(connetMsk & Intrinsic::LanemaskLt(laneIdx)); + + if (electedPrefix == 0) + { + prefixSum++; + } + } + + m_prefixOrignal[warpIdx] = prefixSum; + } + } + + void BuildLevel1() + { + constexpr unsigned int blockDim = 
1024; + + int nBlocks = (m_numVerts + blockDim - 1) / blockDim; + + OMP_PARALLEL_FOR + + for (int blockIdx = 0; blockIdx < nBlocks; ++blockIdx) + { + unsigned int warpPrefix[32]; + + unsigned int theBlockGlobalPrefix = 0; + + unsigned int globalWarpOffset = blockIdx * 32; + + for (int warpIdx = 0; warpIdx < blockIdx; ++warpIdx) + { + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + theBlockGlobalPrefix += m_prefixOrignal[threadIdx]; + } + } + + for (int warpIdx = 0; warpIdx < 32; ++warpIdx) + { + warpPrefix[warpIdx] = m_prefixOrignal[globalWarpOffset + warpIdx]; + } + + Intrinsic::SyncBlock(); + + for (unsigned int warpIdx = 0; warpIdx < 32; ++warpIdx) + { + unsigned int theWarpGlobalPrefix = theBlockGlobalPrefix; + + for (unsigned int prevWarpIdx = 0; prevWarpIdx < warpIdx; ++prevWarpIdx) + { + theWarpGlobalPrefix += warpPrefix[prevWarpIdx]; + } + + + unsigned int electedMask = 0; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= m_numVerts) { break; } + + + if (vid == m_numVerts - 1) + { + m_levelSize[1].x = theWarpGlobalPrefix + warpPrefix[warpIdx]; + m_levelSize[1].y = (m_numVerts + 31) / 32 * 32; + } + + unsigned int connMsk = m_fineConnectMask[vid]; + + unsigned int electedPrefix = Intrinsic::Popcount32(connMsk & Intrinsic::LanemaskLt(laneIdx)); + + if (electedPrefix == 0) + { + electedMask |= 1U << laneIdx; + } + //unsigned int electedMask = Intrinsic::BallotSync(-1, electedPrefix == 0); + } + + + unsigned int lanePrefix[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= m_numVerts) { break; } + + + lanePrefix[laneIdx] = Intrinsic::Popcount32(electedMask & Intrinsic::LanemaskLt(laneIdx)); + + lanePrefix[laneIdx] += theWarpGlobalPrefix; + } 
+ + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + unsigned int threadIdx = laneIdx + warpIdx * 32; + + int vid = threadIdx + blockIdx * blockDim; + + if (vid >= m_numVerts) { break; } + + + unsigned int connMsk = m_fineConnectMask[vid]; + + unsigned int elected_lane = Intrinsic::Ffs(connMsk) - 1; + + unsigned int theLanePrefix = lanePrefix[elected_lane]; + + m_CoarseSpaceTables[0][vid] = theLanePrefix; + + m_goingNext[vid] = theLanePrefix + (m_numVerts + 31) / 32 * 32; + } + } + } + } + + void BuildConnectMaskLx(int level) + { + constexpr unsigned int warpDim = 32; + + int nWarps = (m_numVerts + warpDim - 1) / warpDim; + + const unsigned int bank = 32; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < nWarps; ++warpIdx) + { + unsigned int prefixMsk[32]; + unsigned int connectMsk[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + prefixMsk[laneIdx] = m_fineConnectMask[vid]; + + unsigned int coarseVid = m_CoarseSpaceTables[level - 1][vid]; + + connectMsk[laneIdx] = 0;// 1U << (coarseVid % bank); //==== include self + + unsigned int kn = m_mappedNeighborsNumRemain[vid]; + + unsigned int nk = 0; + + for (unsigned int k = 0; k < kn; k++) + { + unsigned int connect = m_mappedNeighborsRemain[k][vid]; + + unsigned int coarseConnect = m_CoarseSpaceTables[level - 1][connect]; + + if (coarseVid / bank == coarseConnect / bank) + { + unsigned int off = coarseConnect % bank; + + connectMsk[laneIdx] |= (1U << off); + } + else + { + m_mappedNeighborsRemain[nk][vid] = connect; + nk++; + } + } + + m_mappedNeighborsNumRemain[vid] = nk; + } + + bool isFullWarp = (prefixMsk[0] == -1); + + if (isFullWarp) + { + unsigned int connectMskFull = 0; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + connectMskFull |= connectMsk[laneIdx]; + } + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int 
vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + connectMsk[laneIdx] = connectMskFull; + } + } + else + { + unsigned int cacheMsk[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + cacheMsk[laneIdx] = 0; + } + + unsigned int electedLane[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + electedLane[laneIdx] = Intrinsic::Ffs(prefixMsk[laneIdx]) - 1; + + if (connectMsk[laneIdx]) + { + //cacheMsk[electedLane[laneIdx]] |= connectMsk[laneIdx]; + Intrinsic::AtomicOr(&cacheMsk[electedLane[laneIdx]], connectMsk[laneIdx]); + } + } + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + connectMsk[laneIdx] = cacheMsk[electedLane[laneIdx]]; + } + } + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + unsigned int coarseVid = m_CoarseSpaceTables[level - 1][vid]; + + unsigned int electedPrefix = Intrinsic::Popcount32(prefixMsk[laneIdx] & Intrinsic::LanemaskLt(laneIdx)); + + if (connectMsk[laneIdx] && electedPrefix == 0) + { + Intrinsic::AtomicOr(m_nextConnectMsk.data() + coarseVid, connectMsk[laneIdx]); + } + } + } + } + + void NextLevelCluster(int level) + { + const int levelNum = m_levelSize[level].x; + + //printf("nextLevelClusters %d %d\n", levelNum, m_levelSize[level].y); + + const int warpDim = 32; + + int numWarps = (m_numVerts + warpDim - 1) / warpDim; + + int maxWarpIdx = (levelNum + warpDim - 1) / warpDim; + + OMP_PARALLEL_FOR + + for (int warpIdx = 0; warpIdx < numWarps; ++warpIdx) + { + unsigned int cachedMsk[32]; + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= (levelNum + 31) / 32 * 32) + { + break;// early culling + } + + unsigned int connectMsk = (1U << laneIdx); + + if (vid < levelNum) + { + connectMsk |= 
                    // --- tail of a warp-connectivity kernel whose start is above this view:
                    // fold the stored per-node mask into this lane's connectivity, cache it,
                    // and stop once the lane index runs past the level's node count.
                    m_nextConnectMsk[vid];// connect to current
                }

                cachedMsk[laneIdx] = connectMsk;

                if (vid >= levelNum)
                {
                    break;
                }
            }

            Intrinsic::SyncWarp();

            unsigned int prefixSum = 0;

            // Transitive closure of the 32-lane connectivity masks: each lane repeatedly
            // ORs in the masks of lanes it can reach until no new lane is discovered.
            for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
            {
                int vid = laneIdx + warpIdx * 32;

                if (vid >= levelNum || vid >= (levelNum + 31) / 32 * 32)
                {
                    break;// early culling
                }

                unsigned int connectMsk = cachedMsk[laneIdx];

                unsigned int visited = (1U << laneIdx);

                while (true)
                {
                    unsigned int todo = visited ^ connectMsk;

                    if (!todo)
                    {
                        break;
                    }

                    unsigned int nextVisist = Intrinsic::Ffs(todo) - 1;

                    visited |= (1U << nextVisist);

                    connectMsk |= cachedMsk[nextVisist];
                }

                m_nextConnectMsk[vid] = connectMsk;

                // A lane is "elected" (cluster representative) iff no lower lane shares
                // its closed connectivity mask.
                unsigned int electedPrefix = Intrinsic::Popcount32(connectMsk & Intrinsic::LanemaskLt(laneIdx));

                if (electedPrefix == 0)
                {
                    prefixSum++;
                }
            }

            // Per-warp count of elected representatives, consumed by PrefixSumLx below.
            if (warpIdx < maxWarpIdx)
            {
                m_nextPrefix[warpIdx] = prefixSum;
            }
        }
    }

    // Exclusive prefix sum over the elected-representative counts of level `level`.
    // Assigns every node of the level to its representative's cluster index and records
    // the redirection in m_goingNext; also writes the size/offset of level+1 into
    // m_levelSize.  This appears to be a CPU port of a CUDA kernel: the blockIdx /
    // warpIdx / laneIdx loops emulate a 1024-thread (32-warp) GPU block.
    void PrefixSumLx(int level)
    {
        const int levelNum = m_levelSize[level].x;       // node count of this level

        const int levelBegin = m_levelSize[level].y;     // global offset of this level

        unsigned int blockDim = 1024;                    // 32 warps of 32 lanes

        int nBlocks = (m_numVerts + blockDim - 1) / blockDim;

        OMP_PARALLEL_FOR

        for (int blockIdx = 0; blockIdx < nBlocks; ++blockIdx)
        {
            unsigned int warpPrefix[32];

            unsigned int theBlockPrefix = 0;

            unsigned int globalWarpOffset = blockIdx * 32;

            // Sum the elected counts of all warps belonging to earlier blocks
            // (global warp indices 0 .. blockIdx*32-1) to get this block's base offset.
            for (int warpIdx = 0; warpIdx < blockIdx; ++warpIdx)
            {
                for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
                {
                    unsigned int threadIdx = laneIdx + warpIdx * 32;

                    unsigned int vid = threadIdx + blockIdx * blockDim;

                    if (vid >= (levelNum + blockDim - 1) / blockDim * blockDim)
                    {
                        break;
                    }

                    theBlockPrefix += m_nextPrefix[threadIdx];
                }
            }

            for (int warpIdx = 0; warpIdx < 32; ++warpIdx)
            {
                warpPrefix[warpIdx] = m_nextPrefix[globalWarpOffset + warpIdx];
            }

            Intrinsic::SyncBlock();

            for (unsigned int warpIdx = 0; warpIdx < 32; ++warpIdx)
            {
                // Offset of this warp = block base + elected counts of preceding warps.
                unsigned int theWarpPrefix = theBlockPrefix;

                for (unsigned int prevWarpIdx = 0; prevWarpIdx < warpIdx; ++prevWarpIdx)
                {
                    theWarpPrefix += warpPrefix[prevWarpIdx];
                }

                unsigned int electedMsk = 0;

                for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
                {
                    unsigned int threadIdx = laneIdx + warpIdx * 32;

                    int vid = threadIdx + blockIdx * blockDim;

                    if (vid >= levelNum)
                    {
                        break;
                    }
                    // The last node of the level publishes the next level's
                    // total cluster count and its (32-aligned) begin offset.
                    if (vid == levelNum - 1)
                    {
                        m_levelSize[level + 1].x = theWarpPrefix + warpPrefix[warpIdx];
                        m_levelSize[level + 1].y = levelBegin + (levelNum + 31) / 32 * 32;
                    }

                    unsigned int connMsk = m_nextConnectMsk[vid];

                    unsigned int electedPrefix = Intrinsic::Popcount32(connMsk & Intrinsic::LanemaskLt(laneIdx));

                    if (electedPrefix == 0)
                    {
                        electedMsk |= (1U << laneIdx);
                    }

                    //unsigned int electedMask = Intrinsic::BallotSync(-1, electedPrefix == 0);
                }

                // lanePrefix[lane] = global cluster index a representative at `lane`
                // would receive (count of elected lanes below it + warp offset).
                unsigned int lanePrefix[32];

                for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
                {
                    lanePrefix[laneIdx] = Intrinsic::Popcount32(electedMsk & Intrinsic::LanemaskLt(laneIdx));
                    lanePrefix[laneIdx] += theWarpPrefix;
                }

                for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
                {
                    unsigned int threadIdx = laneIdx + warpIdx * 32;

                    int vid = threadIdx + blockIdx * blockDim;

                    if (vid >= levelNum) { break; }

                    // Each node adopts the cluster index of the lowest lane in its
                    // connectivity mask (its representative).
                    int electedLane = Intrinsic::Ffs(m_nextConnectMsk[vid]) - 1;

                    unsigned int theLanePrefix = lanePrefix[electedLane];

                    // m_nextConnectMsk is repurposed here: it now stores the node's
                    // destination cluster index (consumed by ComputeNextLevel).
                    m_nextConnectMsk[vid] = theLanePrefix;

                    m_goingNext[vid + levelBegin] = theLanePrefix + levelBegin + (levelNum + 31) / 32 * 32;
                }
            }
        }
    }

    // Propagates the coarse-space table one level up: a vertex's level-`level`
    // cluster is the cluster index (stored in m_nextConnectMsk by PrefixSumLx)
    // of its level-(level-1) cluster.
    void ComputeNextLevel(int level)
    {
        OMP_PARALLEL_FOR

        for (int vid = 0; vid < m_numVerts; ++vid)
        {
            int next = m_CoarseSpaceTables[level - 1][vid];

            m_CoarseSpaceTables[level][vid] = m_nextConnectMsk[next];
        }
    }

    // Total number of clusters over all levels = end offset recorded for the last level.
    void TotalNodes()
    {
        m_totalNumberClusters = m_levelSize[m_numLevel].y;
    }

    // For every vertex, walks the m_goingNext chain across all levels and records
    // (a) the coarse ancestor at each level in m_coarseTables, and (b) the first
    // level at which the whole warp has collapsed into lane 0's cluster
    // (m_denseLevel).  Warp/lane loops emulate a GPU kernel on the CPU.
    void AggregationKernel()
    {
        const int warpDim = 32;

        int numWarps = (m_numVerts +
        warpDim - 1) / warpDim;

        OMP_PARALLEL_FOR

        for (int warpIdx = 0; warpIdx < numWarps; ++warpIdx)
        {
            unsigned int firstInWarp = warpIdx * 32;   // NOTE(review): unused below

            int curr[32];      // current ancestor per lane
            int next[32];      // next-level ancestor per lane
            int aggLevel[32];  // first level where the lane joins lane 0's cluster

            for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
            {
                int vid = laneIdx + warpIdx * 32;

                if (vid >= m_numVerts) { break; }

                curr[laneIdx] = vid;

                aggLevel[laneIdx] = m_numLevel - 1;
            }

            Int4 coarseTable[32];

            for (int l = 0; l < m_numLevel - 1; ++l)
            {
                for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
                {
                    int vid = laneIdx + warpIdx * 32;

                    if (vid >= m_numVerts) { break; }

                    next[laneIdx] = m_goingNext[curr[laneIdx]];
                }

                Intrinsic::SyncWarp();

                for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
                {
                    int vid = laneIdx + warpIdx * 32;

                    if (vid >= m_numVerts) { break; }

                    // Lane agrees with lane 0 at this level -> record the earliest such level.
                    if (next[laneIdx] == next[0])
                    {
                        aggLevel[laneIdx] = Math::Min(l, aggLevel[laneIdx]);
                    }

                    curr[laneIdx] = next[laneIdx];
                    coarseTable[laneIdx][l] = next[laneIdx];   // Int4: at most 4 levels stored
                }
            }


            for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
            {
                int vid = laneIdx + warpIdx * 32;

                if (vid >= m_numVerts) { break; }

                m_denseLevel[vid] = aggLevel[laneIdx];

                m_coarseTables[vid] = coarseTable[laneIdx];
            }
        }
    }

    // Scatters a pairwise (v1,v2) hessian contribution into the Schwarz hierarchy.
    // Climbs m_goingNext until both ids land in the same 32-wide bank, writes the
    // off-diagonal into the dense banked matrix there, and pushes a diagonal
    // correction one further level up.
    // NOTE(review): the container template arguments were stripped by extraction —
    // presumably std::vector<SeMatrix3f>, SeArray2D<SeMatrix3f>, std::vector<int>; confirm
    // against the original header.  Parameter vNum is unused here.
    void AdditionalSchwarzHessian2(SeMatrix3f hessian, std::vector& pAdditionalHessian, SeArray2D& pDenseHessian, int v1, int v2, const std::vector& pGoingNext, int nLevel, int vNum)
    {
        int level = 0;
        unsigned int myID = v1;
        unsigned int otID = v2;
        const unsigned int bank = 32;

        // Walk both vertices up the hierarchy until they share a bank of 32.
        while (myID / bank != otID / bank && level < nLevel)
        {
            myID = pGoingNext[myID];
            otID = pGoingNext[otID];
            level++;
        }

        // Never met within the hierarchy: contribution is dropped.
        if (level >= nLevel)
            return;

        // Symmetric off-diagonal entries in the banked dense storage.
        Intrinsic::AtomicAdd(&pDenseHessian[otID % bank][myID], hessian);
        Intrinsic::AtomicAdd(&pDenseHessian[myID % bank][otID], hessian);

        if (level < nLevel - 1)
        {
            myID = pGoingNext[myID];
            otID = pGoingNext[otID];

            if (myID == otID)
            {
                // Both map to the same coarse node: the two contributions collapse
                // onto one diagonal entry.
                Intrinsic::AtomicAdd(&pAdditionalHessian[myID], hessian * 2.0f);
            }
            else
            {
                Intrinsic::AtomicAdd(&pAdditionalHessian[myID], hessian);
                Intrinsic::AtomicAdd(&pAdditionalHessian[otID], hessian);
            }
        }
    }

    // Turns each collision stencil into a rank-1 hessian (stiff * d d^T) and
    // scatters weighted diagonal and pairwise contributions into the hierarchy.
    void PrepareCollisionHessian()
    {
        OMP_PARALLEL_FOR

        for (int i = 0; i < m_stencilNum; i++)
        {
            const auto& s = m_stencils[i];
            auto idx = m_stencilIndexMapped[i];

            Float3 d(s.direction[0], s.direction[1], s.direction[2]);

            SeMatrix3f hessian = OuterProduct(d, d * s.stiff);

            // Per-vertex diagonal terms, weighted by the squared stencil weight.
            for (int it = 0; it < s.verextNumPerStencil; it++)
            {
                Intrinsic::AtomicAdd(&m_additionalHessian32[idx[it]], hessian * Math::Square(s.weight[it]));
            }

            // Pairwise cross terms for every vertex pair in the stencil.
            for (int ita = 0; ita < s.verextNumPerStencil; ita++)
            {
                for (int itb = ita + 1; itb < s.verextNumPerStencil; itb++)
                {
                    AdditionalSchwarzHessian2(s.weight[ita] * s.weight[itb] * hessian, m_additionalHessian32, m_hessian32, idx[ita], idx[itb], m_goingNext, m_numLevel, m_numVerts);
                }
            }
        }
    }

    // Assembles the banked hessian hierarchy from the per-vertex diagonal blocks,
    // the CSR off-diagonal blocks, and the previously accumulated additional
    // (collision) hessians.
    void PrepareHessian(const SeMatrix3f* diagonal, const SeMatrix3f* csrOffDiagonals, const int* csrRanges)
    {
        const unsigned int bank = 32;

        int nVC = (m_numVerts + 31) / 32 * 32;   // vertex count rounded up to warp size

        OMP_PARALLEL_FOR

        // Coarse-level nodes: propagate their additional diagonal up the chain.
        for (int vid = nVC; vid < m_totalNumberClusters; ++vid)
        {
            auto oldDiagonal = m_additionalHessian32[vid];
            int myID = vid;
            Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID], oldDiagonal);
            while (true)
            {
                myID = m_goingNext[myID];
                if (myID >= m_totalNumberClusters)
                {
                    break;
                }
                Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID], oldDiagonal);
            }
        }

        int numWarps = (nVC + 31) / 32;

        OMP_PARALLEL_FOR

        for (int warpIdx = 0; warpIdx < numWarps; ++warpIdx)
        {

            // Emulated CUDA shared memory: per-warp accumulator, one slot per level.
            /*__shared__*/ SeMatrix3f AtomicCache[10]; Utility::MemsetZero(AtomicCache, 10);

            for (int laneIdx = 0; laneIdx < 32; ++laneIdx)
            {
                int vid = laneIdx + warpIdx * 32;

                if (vid >= nVC) { break; }

                if (vid < m_numVerts)
                {
                    const int denseLevel = m_denseLevel[vid];

                    const int vidOrig =
                    m_MapperSortedGetOriginal[vid];

                    int oldNum = m_mappedNeighborsNum[vid];

                    auto oldDiagonal = diagonal[vidOrig] + m_additionalHessian32[vid];

                    m_hessian32[vid % bank][vid] += oldDiagonal; // self diagonal

                    {
                        // Propagate the diagonal up to this vertex's dense level;
                        // the remainder is parked in the per-warp AtomicCache.
                        int level = 0;
                        int myID = vid;
                        while (level < denseLevel)
                        {
                            myID = m_goingNext[myID];
                            Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID], oldDiagonal);
                            //printf("%d %d", level, denseLevel);
                            level++;
                        }
                        Intrinsic::AtomicAdd(&AtomicCache[level], oldDiagonal);
                    }

                    // Off-diagonal neighbour blocks (k = 0 is the vertex itself).
                    for (int k = 1; k < oldNum; k++)
                    {
                        const unsigned int neighbor = m_mappedNeighbors[k][vid];

                        SeMatrix3f mat = csrOffDiagonals[csrRanges[vidOrig] + k - 1];

                        int level = 0;
                        unsigned int levelSz = m_numVerts;   // NOTE(review): unused
                        unsigned int myID = vid;
                        unsigned int otID = neighbor;

                        // Climb until both endpoints share a 32-wide bank.
                        while (myID / bank != otID / bank && level < m_numLevel)
                        {
                            level++;
                            myID = m_goingNext[myID];
                            otID = m_goingNext[otID];
                        }

                        if (level >= m_numLevel)
                        {
                            //printf("%d\n", level);
                            continue;
                        }


                        // Level 0 entries are lane-exclusive, so a plain add suffices.
                        if (level == 0)
                            m_hessian32[otID % bank][myID] += mat;
                        else
                            Intrinsic::AtomicAdd(&m_hessian32[otID % bank][myID], mat);

                        while (level < denseLevel)
                        {
                            myID = m_goingNext[myID];
                            Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID], mat);
                            //printf("%d %d", level, denseLevel);
                            level++;
                        }
                        Intrinsic::AtomicAdd(&AtomicCache[level], mat);
                    }
                }
            }

            Intrinsic::SyncWarp();

            // Lane 0 flushes the cached per-level sums up the warp's ancestor chain.
            int vid = warpIdx * 32;

            if (vid < nVC)
            {
                SeMatrix3f total(0.f);

                unsigned int myID = vid;

                for (int level = 0; level < m_numLevel - 1; level++)
                {
                    total += AtomicCache[level];

                    myID = m_goingNext[myID];

                    Intrinsic::AtomicAdd(&m_hessian32[myID % bank][myID], total);
                }
            }
        }
    }

    // Explicitly inverts each 96x96 (32 vertices x 3 DOF) banked hessian block via
    // full Gauss-Jordan elimination (despite the LDLt name) and stores the inverse
    // in the packed layout m_invSymR consumed by SchwarzLocalXSym: 96 diagonal
    // entries, then upper-triangle / mid / lower-trapezoid strips of 4-float banks.
    void LDLtInverse512()
    {
        int triSz = (1 + 96) * 96 / 2 + 16 * 3;   // packed floats per 96x96 block

        const int blockNum = m_totalSz / 32;

        for (int block = 0; block < blockNum; block++)
        {
            float A[96][96] = {};
            float B[96][96] = {};

            for (int it = 0; it < 96; it++)
            {
                B[it][it] = 1.0f;
            }
            // Gather the 32x32 grid of 3x3 blocks into the dense 96x96 matrix A;
            // an all-zero diagonal block is replaced by identity to keep A invertible.
            for (int x = 0; x < 32; x++)
            {
                for (int y = 0; y < 32; y++)
                {
                    SeMatrix3f temp = m_hessian32[y][x + block * 32];

                    if (x == y && temp(0, 0) == 0.0f)
                    {
                        temp = SeMatrix3f::Identity();
                    }
                    for (int ii = 0; ii < 3; ii++)
                    {
                        for (int jj = 0; jj < 3; jj++)
                        {
                            A[x * 3 + ii][y * 3 + jj] = temp(ii, jj);
                        }
                    }
                }
            }
            // Forward elimination (downward): zero the sub-diagonal of A, recording
            // the multipliers in B.
            for (int x = 0; x < 96; x++)
            {
                float diag = A[x][x];
                for (int y = x + 1; y < 96; y++)
                {
                    if (A[y][x] == 0.0f)
                        continue;
                    float rt = -A[y][x] / diag;
                    A[y][x] = 0.0f;

                    for (int xx = x + 1; xx < 96; xx++)
                    {
                        A[y][xx] += A[x][xx] * rt;
                    }

                    B[y][x] = rt;

                    for (int xx = 0; xx < x; xx++)
                    {
                        B[y][xx] += B[x][xx] * rt;
                    }
                }
            }
            /*if (block == pi)
            {
                for (int x = 0; x < 96; x++)
                {
                    for (int y = 0; y < 96; y++)
                    {
                        of3 << B[x][y] << " ";
                    }
                    of3 << std::endl;
                }
            }*/

            // Back substitution (upward): zero the super-diagonal, applying the same
            // row operations to B.
            for (int x = 95; x >= 0; x--)
            {
                float diag = A[x][x];
                for (int y = x - 1; y >= 0; y--)
                {
                    if (A[y][x] == 0.0f)
                    {
                        continue;
                    }

                    float rt = -A[y][x] / diag;
                    A[y][x] = 0.0f;

                    for (int xx = x + 1; xx < 96; xx++)
                    {
                        A[y][xx] += A[x][xx] * rt;
                    }
                    for (int xx = 0; xx < 96; xx++)
                    {
                        B[y][xx] += B[x][xx] * rt;
                    }
                }
            }

            // Scale by the remaining diagonal: B now holds the inverse of A.
            for (int x = 0; x < 96; x++)
            {
                for (int y = 0; y < 96; y++)
                {
                    B[x][y] *= 1.f / A[x][x];
                }
            }

            //diagonal
            for (int x = 0; x < 32; x++)
            {
                float d0 = B[x * 3 + 0][x * 3 + 0];
                float d1 = B[x * 3 + 1][x * 3 + 1];
                float d2 = B[x * 3 + 2][x * 3 + 2];

                m_invSymR[block * triSz + x + 0] = d0;
                m_invSymR[block * triSz + x + 32] = d1;
                m_invSymR[block * triSz + x + 64] = d2;
            }

            // upper triangle
            for (int y = 0; y < 16; y++)
            {
                for (int x = 0; x < 32; x++)
                {
                    float value = 0.f;
                    if (x <= y)
                    {
                        if (y != 15)
                            value = B[31 - y + x][x];
                    }
                    else
                        value = B[x][x - y - 1];

                    int bank = y / 4;
                    int off = y % 4;

                    m_invSymR[block * triSz + 96 + x * 4 + off + bank * 32 * 4] = value;
                }
            }

            // mid
            for (int y = 0; y < 32; y++)
            {
                for (int x = 0; x < 32; x++)
                {
                    float value = B[x + 32][y + x];
                    int bank = y / 4;
                    int off = y % 4;

                    m_invSymR[block * triSz + 96 + 512 + x * 4 + off + bank * 32 * 4] = value;
                }
            }

            for (int y = 0; y < 16; y++)
            {
                for (int x = 0; x < 32; x++)
                {
                    float value = 0.f;
                    if (x <= y)
                    {
                        if (y != 15)
                            value = B[63 - y + x][x];
                    }
                    else
                    {
                        value = B[x + 32][x - y - 1];
                    }

                    int bank = y / 4;
                    int off = y % 4;

                    m_invSymR[block * triSz + 96 + 512 + 1024 + x * 4 + off + bank * 32 * 4] = value;
                }
            }

            // down
            for (int y = 0; y < 64; y++)
            {
                for (int x = 0; x < 32; x++)
                {
                    float value = B[x + 64][y + x];
                    int bank = y / 4;
                    int off = y % 4;

                    m_invSymR[block * triSz + 96 + 2048 + x * 4 + off + bank * 32 * 4] = value;
                }
            }

            for (int y = 0; y < 16; y++)
            {
                for (int x = 0; x < 32; x++)
                {
                    float value = 0.f;
                    if (x <= y)
                    {
                        if (y != 15)
                            value = B[95 - y + x][x];
                    }
                    else
                    {
                        value = B[x + 64][x - y - 1];
                    }
                    int bank = y / 4;
                    int off = y % 4;

                    m_invSymR[block * triSz + 96 + 4096 + x * 4 + off + bank * 32 * 4] = value;
                }
            }
        }
    }

    // Restricts the fine CG residual onto every level of the hierarchy: copies it
    // into sorted order (m_mappedR) and accumulates per-cluster sums into the
    // coarse nodes listed in m_coarseTables.
    void BuildResidualHierarchy(const SeVec3fSimd* m_cgResidual)
    {
        int numWarp = (m_numVerts + 31) / 32;

        OMP_PARALLEL_FOR

        for (int warpIdx = 0; warpIdx < numWarp; ++warpIdx)
        {
            unsigned int vidLead = warpIdx * 32;

            unsigned int activeMsk = 0;

            for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx)
            {
                int vid = laneIdx + warpIdx * 32;

                activeMsk |= vid < m_numVerts ?
(1U << laneIdx) : (0U); + } + + bool isFullConnected = (m_fineConnectMask[vidLead] == -1 && activeMsk == -1); + + if (isFullConnected) + { + SeVec3fSimd sumR(0.f); + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + unsigned int vidOrig = m_MapperSortedGetOriginal[vid]; + + SeVec3fSimd r = m_cgResidual[vidOrig]; + + m_mappedR[vid] = r; + + sumR += r; + } + + for (int levelIdx = 0; levelIdx < m_numLevel - 1; ++levelIdx) + { + int tb = m_coarseTables[vidLead][levelIdx]; + + Intrinsic::AtomicAdd(&m_mappedR[tb], sumR); + } + } + else + { + /*__shared__*/ SeVec3fSimd c_sumResidual[32]; Utility::MemsetZero(c_sumResidual, 32); + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + unsigned int vidOrig = m_MapperSortedGetOriginal[vid]; + + SeVec3fSimd r = m_cgResidual[vidOrig]; + + m_mappedR[vid] = r; + + //==== + + unsigned int connectMsk = m_fineConnectMask[vid]; + + int elected_lane = Intrinsic::Ffs(connectMsk) - 1; + + Intrinsic::AtomicAdd(&c_sumResidual[elected_lane], r); + } + + Intrinsic::SyncWarp(); + + for (unsigned int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32; + + if (vid >= m_numVerts) { break; } + + unsigned int connectMsk = m_fineConnectMask[vid]; + + unsigned int electedPrefix = Intrinsic::Popcount32(connectMsk & Intrinsic::LanemaskLt(laneIdx)); + + if (electedPrefix == 0) + { + const SeVec3fSimd& r = c_sumResidual[laneIdx]; + + const Int4& table = m_coarseTables[vid]; + + for (int it = 1; it < Math::Min(m_numLevel, 4); it++) + { + int now = table[it - 1]; + + Intrinsic::AtomicAdd(&m_mappedR[now], r); + } + } + } + } + } + } + + void SchwarzLocalXSym() + { + constexpr int blockDim = 128; + + int nBlocks = (m_totalSz + blockDim - 1) / blockDim; + + const int triSz = (1 + 96) * 96 / 2 + 16 * 3; + + OMP_PARALLEL_FOR + + for (int blockIdx = 0; blockIdx < 
nBlocks; ++blockIdx) + { + float cacheRhs[4][96]; + float cacheOut[4][96]; + + for (int warpIdx = 0; warpIdx < 4; ++warpIdx) + { + float* const cRhs = cacheRhs[warpIdx]; + float* const cOut = cacheOut[warpIdx]; + + float* pMatrix[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32 + blockIdx * blockDim; + + if (vid >= m_totalNumberClusters) { break; } + + const auto globalWarpIdx = vid / 32; + + pMatrix[laneIdx] = m_invSymR.data() + triSz * globalWarpIdx; + + // diagonal + + float d0 = pMatrix[laneIdx][laneIdx + 0]; + float d1 = pMatrix[laneIdx][laneIdx + 32]; + float d2 = pMatrix[laneIdx][laneIdx + 64]; + + auto rhs = m_mappedR[vid]; + + cRhs[laneIdx * 3 + 0] = rhs.x; + cRhs[laneIdx * 3 + 1] = rhs.y; + cRhs[laneIdx * 3 + 2] = rhs.z; + cOut[laneIdx * 3 + 0] = rhs.x * d0; + cOut[laneIdx * 3 + 1] = rhs.y * d1; + cOut[laneIdx * 3 + 2] = rhs.z * d2; + + pMatrix[laneIdx] += 32 * 3; + } + + Intrinsic::SyncWarp(); + + Float4 prefetchValue4[32]; + + float singularRHS[32]; + float singularSum[32]; + float normalRHS[32]; + float normalSum[32]; + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32 + blockIdx * blockDim; + + if (vid >= m_totalNumberClusters) { break; } + + singularRHS[laneIdx] = cRhs[laneIdx]; + + singularSum[laneIdx] = 0.f; + + //------------------------------------------------ + // upper tri + + normalRHS[laneIdx] = cRhs[laneIdx]; + normalSum[laneIdx] = 0.f; + + prefetchValue4[laneIdx] = ((const Float4*)pMatrix[laneIdx])[laneIdx]; + + for (int bankIdx = 0; bankIdx < 4; bankIdx++) + { + Float4 value = ((const Float4*)pMatrix[laneIdx])[laneIdx + bankIdx * 32 + 32]; + + int offset = bankIdx * 4; + + for (int y = offset; y < offset + 4; y++) + { + Intrinsic::SyncWarp(); + + float prefetchValue = prefetchValue4[laneIdx][y - offset]; + + if (laneIdx <= y) + { + if (y != 15) + { + float otherRhs = cRhs[31 - y + laneIdx]; + cOut[31 - y + laneIdx] += prefetchValue * singularRHS[laneIdx]; + 
singularSum[laneIdx] += prefetchValue * otherRhs; + } + } + else + { + float otherRhs = cRhs[laneIdx - y - 1]; + cOut[laneIdx - y - 1] += prefetchValue * normalRHS[laneIdx]; + normalSum[laneIdx] += prefetchValue * otherRhs; + } + } + + prefetchValue4[laneIdx] = value; + } + } + + Intrinsic::SyncWarp(); + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32 + blockIdx * blockDim; + + if (vid >= m_totalNumberClusters) { break; } + + pMatrix[laneIdx] += 16 * 32; + + cOut[laneIdx] += normalSum[laneIdx]; + //------------------------------------------------ + // mid trapezoid + + normalRHS[laneIdx] = cRhs[laneIdx + 32]; + normalSum[laneIdx] = 0.f; + + for (int bank = 0; bank < 8; bank++) + { + Float4 value = ((const Float4*)pMatrix[laneIdx])[laneIdx + bank * 32 + 32]; + for (int y = bank * 4; y < bank * 4 + 4; y++) + { + Intrinsic::SyncWarp(); + + float prefetchValue = prefetchValue4[laneIdx][y - bank * 4]; + float otherRhs = cRhs[y + laneIdx]; + + cOut[y + laneIdx] += prefetchValue * normalRHS[laneIdx]; + normalSum[laneIdx] += prefetchValue * otherRhs; + } + prefetchValue4[laneIdx] = value; + } + + pMatrix[laneIdx] += 32 * 32; + + for (int bankIdx = 0; bankIdx < 4; bankIdx++) + { + Float4 value = ((const Float4*)pMatrix[laneIdx])[laneIdx + bankIdx * 32 + 32]; + + for (int y = bankIdx * 4; y < bankIdx * 4 + 4; y++) + { + Intrinsic::SyncWarp(); + + float prefetchValue = prefetchValue4[laneIdx][y - bankIdx * 4]; + + if (laneIdx <= y) + { + if (y != 15) + { + float otherRhs = cRhs[63 - y + laneIdx]; + cOut[63 - y + laneIdx] += prefetchValue * singularRHS[laneIdx]; + singularSum[laneIdx] += prefetchValue * otherRhs; + } + } + else + { + float otherRhs = cRhs[laneIdx - y - 1]; + cOut[laneIdx - y - 1] += prefetchValue * normalRHS[laneIdx]; + normalSum[laneIdx] += prefetchValue * otherRhs; + } + } + prefetchValue4[laneIdx] = value; + } + pMatrix[laneIdx] += 32 * 16; + } + + Intrinsic::SyncWarp(); + + for (int laneIdx = 0; laneIdx < 32; 
++laneIdx) + { + int vid = laneIdx + warpIdx * 32 + blockIdx * blockDim; + + if (vid >= m_totalNumberClusters) { break; } + + cOut[laneIdx + 32] += normalSum[laneIdx]; + //------------------------------------------------ + // lower trapezoid + normalRHS[laneIdx] = cRhs[laneIdx + 64]; + normalSum[laneIdx] = 0.f; + normalRHS[laneIdx] = cRhs[laneIdx + 64]; + normalSum[laneIdx] = 0.f; + + for (int bank = 0; bank < 16; bank++) + { + Float4 value = ((const Float4*)pMatrix[laneIdx])[laneIdx + bank * 32 + 32]; + + for (int y = bank * 4; y < bank * 4 + 4; y++) + { + Intrinsic::SyncWarp(); + + float prefetchValue = prefetchValue4[laneIdx][y - bank * 4]; + float otherRhs = cRhs[y + laneIdx]; + + cOut[y + laneIdx] += prefetchValue * normalRHS[laneIdx]; + normalSum[laneIdx] += prefetchValue * otherRhs; + } + + prefetchValue4[laneIdx] = value; + } + + pMatrix[laneIdx] += 32 * 64; + + for (int bank = 0; bank < 4; bank++) + { + Float4 value(0.0f); + + if (bank != 3) + value = ((const Float4*)pMatrix[laneIdx])[laneIdx + bank * 32 + 32]; + + for (int y = bank * 4; y < bank * 4 + 4; y++) + { + Intrinsic::SyncWarp(); + + float prefetchValue = prefetchValue4[laneIdx][y - bank * 4]; + + if (laneIdx <= y) + { + if (y != 15) + { + float otherRhs = cRhs[95 - y + laneIdx]; + cOut[95 - y + laneIdx] += prefetchValue * singularRHS[laneIdx]; + singularSum[laneIdx] += prefetchValue * otherRhs; + } + } + else + { + float otherRhs = cRhs[laneIdx - y - 1]; + cOut[laneIdx - y - 1] += prefetchValue * normalRHS[laneIdx]; + normalSum[laneIdx] += prefetchValue * otherRhs; + } + } + + prefetchValue4[laneIdx] = value; + } + } + + Intrinsic::SyncWarp(); + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + warpIdx * 32 + blockIdx * blockDim; + + if (vid >= m_totalNumberClusters) { break; } + + cOut[laneIdx + 64] += normalSum[laneIdx]; + cOut[laneIdx] += singularSum[laneIdx]; + } + + Intrinsic::SyncWarp(); + + for (int laneIdx = 0; laneIdx < 32; ++laneIdx) + { + int vid = laneIdx + 
                    warpIdx * 32 + blockIdx * blockDim;

                    if (vid >= m_totalNumberClusters) { break; }

                    // Repack the three scalar lanes back into a SIMD vector.
                    SeVec3fSimd out(0.0f);
                    out.x = cacheOut[warpIdx][laneIdx * 3 + 0];
                    out.y = cacheOut[warpIdx][laneIdx * 3 + 1];
                    out.z = cacheOut[warpIdx][laneIdx * 3 + 2];

                    m_mappedZ[vid] = out;
                }
            }
        }
    }

    // Prolongation + output: for every fine vertex, sums its own z with the z of
    // its coarse ancestors (up to 4 levels, per the Int4 table) and writes the
    // result back in the caller's original vertex order.
    void CollectFinalZ(SeVec3fSimd* m_cgZ)
    {
        for (int vid = 0; vid < m_numVerts; ++vid)
        {
            unsigned int mappedIndex = m_MapperSortedGetOriginal[vid];

            auto z = m_mappedZ[vid];

            Int4 table = m_coarseTables[vid];

            for (int l = 1; l < Math::Min(m_numLevel, 4); l++)
            {
                int now = table[l - 1];

                z += m_mappedZ[now];
            }

            m_cgZ[mappedIndex] = z;
        }
    }
};


SE_NAMESPACE_END
\ No newline at end of file
diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeUtility.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeUtility.h
new file mode 100644
index 0000000..3ec1d84
--- /dev/null
+++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeUtility.h
@@ -0,0 +1,112 @@
+/////////////////////////////////////////////////////////////////////////////////////////////
+// Copyright (C) 2002 - 2022,
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  1. Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
+//  2. Redistributions in binary form must reproduce the above copyright
+//     notice, this list of conditions and the following disclaimer in the
+//     documentation and/or other materials provided with the distribution.
+//  3. The names of its contributors may not be used to endorse or promote
+//     products derived from this software without specific prior written
+//     permission.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "SePreDefine.h" + +SE_NAMESPACE_BEGIN + +namespace Utility +{ + template + size_t ByteSize(const std::vector & data) + { + return sizeof(Type) * data.size(); + } + + template + void Memcpy(Type * dstData, const Type * srcData, size_t size) + { + std::memcpy(dstData, srcData, size * sizeof(Type)); + } + + template + void Memcpy(std::vector & data, const std::vector & srcData, size_t size) + { + /*if(ByteSize(data) < ByteSize(srcData)) + { + SE_WARNING_LOG("DesSize is less than Src Size!"); + }*/ + std::memcpy(data.data(), srcData.data(), SE_MIN(SE_MIN(ByteSize(data), ByteSize(srcData)), size * sizeof(Type1))); + } + + template + void Memset(std::vector& data, const Type & value) + { + std::fill(data.begin(), data.end(), value); + } + + template + void Memset(Type* data, const Type& value, size_t size) + { + std::fill(data, data + size, value); + } + + template + void MemsetZero(std::vector & data) + { + std::memset(data.data(), 0, ByteSize(data)); + } + + template + void MemsetZero(Type * data, size_t size) + { + std::memset(data, 0, sizeof(Type) * size); + } + + template + void MemsetMinusOne(std::vector & data) + { + std::memset(data.data(), -1, 
ByteSize(data)); + } + + template + void MemsetMinusOne(Type * data, size_t size) + { + std::memset(data, -1, sizeof(Type) * size); + } + + template + void ClearAndShrink(std::vector & data) + { + data.clear(); data.shrink_to_fit(); + } + + template + void ResizeAndShrink(std::vector & data, size_t dim) + { + data.resize(dim); data.shrink_to_fit(); + } + + template + void CopyAndShrink(std::vector & desData, const std::vector & srcData) + { + desData = srcData; desData.shrink_to_fit(); + } +} + +SE_NAMESPACE_END diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeVector.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeVector.h new file mode 100644 index 0000000..671b9f5 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeVector.h @@ -0,0 +1,433 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "SePreDefine.h" + +#include + +SE_NAMESPACE_BEGIN + +#define SE_VEC2_ALIGN(Type) SE_ALIGN(SE_MIN(sizeof(Type) * 2, 16)) //! Maximum align bits -> 64. +#define SE_VEC4_ALIGN(Type) SE_ALIGN(SE_MAX(sizeof(Type) * 4, 16)) //! Maximum align bits -> 64. + +/************************************************************************* +***************************** SDVector ***************************** +*************************************************************************/ + +enum VecDescription : int { SIMD_VEC3 = -1, VEC2_XY = 2, VEC3_XYZ = 3, VEC4_XYZW = 4 }; + +template struct SeVector; +template using SeVector2 = SeVector; +template using SeVector3 = SeVector; +template using SeVector4 = SeVector; + +/************************************************************************* +***************************** SDVector ***************************** +*************************************************************************/ + +template struct SeVector +{ + static constexpr int Elements = N; + + Type m_data[N]; + + SeVector() + { + for (int i = 0; i < N; ++i) m_data[i] = 0; + } + + SeVector(Type scalar) + { + for (int i = 0; i < N; ++i) m_data[i] = scalar; + } + + const Type & operator[](unsigned int i) const { SE_ASSERT(i < Elements); return m_data[i]; } + + Type & operator[](unsigned int i) { SE_ASSERT(i < Elements); return m_data[i]; } + + SeVector operator+(const SeVector & a) const + { + SeVector ans; + for 
(int i = 0; i < N; ++i) + ans[i] = m_data[i] + a[i]; + return ans; + } + SeVector operator-(const SeVector & a) const + { + SeVector ans; + for (int i = 0; i < N; ++i) + ans[i] = m_data[i] - a[i]; + return ans; + } + + SeVector operator*(const Type scalar) const + { + SeVector ans; + for (int i = 0; i < N; ++i) + ans[i] = m_data[i] * scalar; + return ans; + } + SeVector operator/(const Type scalar) const + { + Type invScalar = Type(1) / scalar; + SeVector ans; + for (int i = 0; i < N; ++i) + ans[i] = m_data[i] * invScalar; + return ans; + } + + void operator+=(const SeVector & a) + { + for (int i = 0; i < N; ++i) + m_data[i] += a[i]; + } + + void operator-=(const SeVector & a) + { + for (int i = 0; i < N; ++i) + m_data[i] -= a[i]; + } + + friend SeVector operator*(Type scalar, const SeVector & a) { return a * scalar; } + + Type Dot(const SeVector & rhs) const + { + Type sum = 0; + for (int i = 0; i < N; ++i) + sum += m_data[i] * rhs.values[i]; + return sum; + } + + Type Sum() const + { + Type sum = 0; + for (int i = 0; i < N; ++i) + sum += m_data[i]; + return sum; + } + + Type SqrLength()const + { + Type sum = 0; + for (int i = 0; i < N; ++i) + sum += m_data[i] * m_data[i]; + return sum; + } + + Type Length()const + { + return std::sqrt(SqrLength()); + } + + SeVector Normalize()const + { + return (*this) / Length(); + } + + SeVector & NormalizeLocal() + { + (*this) /= Length(); return *this; + } +}; + +/************************************************************************* +**************************** SDVector2 ***************************** +*************************************************************************/ + +template struct SE_VEC2_ALIGN(Type) SeVector +{ + static constexpr int Elements = 2; + + using value_type = Type; + + union + { + struct { Type x, y; }; + struct { Type values[2]; }; + }; + + SeVector() {} + SeVector(Type s1, Type s2) : x(s1), y(s2) {} + SeVector(Type scalar) : x(scalar), y(scalar) {} + + void operator+=(Type scalar) { x += 
scalar; y += scalar; } + void operator-=(Type scalar) { x -= scalar; y -= scalar; } + void operator*=(Type scalar) { x *= scalar; y *= scalar; } + void operator/=(Type scalar) { x /= scalar; y /= scalar; } + + void operator+=(const SeVector & a) { x += a.x; y += a.y; } + void operator-=(const SeVector & a) { x -= a.x; y -= a.y; } + void operator*=(const SeVector & a) { x *= a.x; y *= a.y; } + void operator/=(const SeVector & a) { x /= a.x; y /= a.y; } + + SeVector operator+(Type scalar) const { return SeVector(x + scalar, y + scalar); } + SeVector operator-(Type scalar) const { return SeVector(x - scalar, y - scalar); } + SeVector operator*(Type scalar) const { return SeVector(x * scalar, y * scalar); } + SeVector operator/(Type scalar) const { return SeVector(x / scalar, y / scalar); } + + SeVector operator+(const SeVector & a) const { return SeVector(x + a.x, y + a.y); } + SeVector operator-(const SeVector & a) const { return SeVector(x - a.x, y - a.y); } + SeVector operator*(const SeVector & a) const { return SeVector(x * a.x, y * a.y); } + SeVector operator/(const SeVector & a) const { return SeVector(x / a.x, y / a.y); } + + friend SeVector operator+(Type scalar, const SeVector & a) { return SeVector(scalar + a.x, scalar + a.y); } + friend SeVector operator-(Type scalar, const SeVector & a) { return SeVector(scalar - a.x, scalar - a.y); } + friend SeVector operator*(Type scalar, const SeVector & a) { return SeVector(scalar * a.x, scalar * a.y); } + friend SeVector operator/(Type scalar, const SeVector & a) { return SeVector(scalar / a.x, scalar / a.y); } + + template explicit SeVector(const SeVector2 & vec2) : x(static_cast(vec2.x)), y(static_cast(vec2.y)) {} + template explicit SeVector(const SeVector3 & vec3) : x(static_cast(vec3.x)), y(static_cast(vec3.y)) {} + template explicit SeVector(const SeVector4 & vec4) : x(static_cast(vec4.x)), y(static_cast(vec4.y)) {} + + const Type & operator[](unsigned int i) const { SE_ASSERT(i < Elements); return values[i]; } 
+ Type & operator[](unsigned int i) { SE_ASSERT(i < Elements); return values[i]; } +}; + +/************************************************************************* +**************************** SDVector3 ***************************** +*************************************************************************/ + +template struct SeVector +{ + static constexpr int Elements = 3; + + using value_type = Type; + + union + { + struct { Type x, y, z; }; + struct { Type values[3]; }; + }; + + SeVector() {} + SeVector(Type scalar) : x(scalar), y(scalar), z(scalar) {} + SeVector(Type s1, Type s2, Type s3) : x(s1), y(s2), z(s3) {} + explicit SeVector(Type s1, const SeVector2 & vec2) : x(s1), y(vec2.x), z(vec2.y) {} + explicit SeVector(const SeVector2 & vec2, Type s3) : x(vec2.x), y(vec2.y), z(s3) {} + + void operator+=(const SeVector & a) { x += a.x; y += a.y; z += a.z; } + void operator-=(const SeVector & a) { x -= a.x; y -= a.y; z -= a.z; } + void operator*=(const SeVector & a) { x *= a.x; y *= a.y; z *= a.z; } + void operator/=(const SeVector & a) { x /= a.x; y /= a.y; z /= a.z; } + + void operator+=(Type scalar) { x += scalar; y += scalar; z += scalar; } + void operator-=(Type scalar) { x -= scalar; y -= scalar; z -= scalar; } + void operator*=(Type scalar) { x *= scalar; y *= scalar; z *= scalar; } + void operator/=(Type scalar) { x /= scalar; y /= scalar; z /= scalar; } + + SeVector operator+(const SeVector & a) const { return SeVector(x + a.x, y + a.y, z + a.z); } + SeVector operator-(const SeVector & a) const { return SeVector(x - a.x, y - a.y, z - a.z); } + SeVector operator*(const SeVector & a) const { return SeVector(x * a.x, y * a.y, z * a.z); } + SeVector operator/(const SeVector & a) const { return SeVector(x / a.x, y / a.y, z / a.z); } + + SeVector operator+(Type scalar) const { return SeVector(x + scalar, y + scalar, z + scalar); } + SeVector operator-(Type scalar) const { return SeVector(x - scalar, y - scalar, z - scalar); } + SeVector operator*(Type scalar) 
const { return SeVector(x * scalar, y * scalar, z * scalar); } + SeVector operator/(Type scalar) const { return SeVector(x / scalar, y / scalar, z / scalar); } + + friend SeVector operator+(Type scalar, const SeVector & a) { return SeVector(scalar + a.x, scalar + a.y, scalar + a.z); } + friend SeVector operator-(Type scalar, const SeVector & a) { return SeVector(scalar - a.x, scalar - a.y, scalar - a.z); } + friend SeVector operator*(Type scalar, const SeVector & a) { return SeVector(scalar * a.x, scalar * a.y, scalar * a.z); } + friend SeVector operator/(Type scalar, const SeVector & a) { return SeVector(scalar / a.x, scalar / a.y, scalar / a.z); } + + template explicit SeVector(const SeVector2 & vec2) : x(static_cast(vec2.x)), y(static_cast(vec2.y)), z(static_cast(0)) {} + template explicit SeVector(const SeVector3 & vec3) : x(static_cast(vec3.x)), y(static_cast(vec3.y)), z(static_cast(vec3.z)) {} + template explicit SeVector(const SeVector4 & vec4) : x(static_cast(vec4.x)), y(static_cast(vec4.y)), z(static_cast(vec4.z)) {} + + const Type & operator[](unsigned int i) const { SE_ASSERT(i < Elements); return values[i]; } + Type & operator[](unsigned int i) { SE_ASSERT(i < Elements); return values[i]; } +}; + +/************************************************************************* +**************************** SDVector4 ***************************** +*************************************************************************/ + +template struct SE_VEC4_ALIGN(Type) SeVector +{ + static constexpr int Elements = 4; + + using value_type = Type; + + union + { + struct { Type values[4]; }; + struct { Type x, y, z, w; }; + struct { SeVector3 xyz; }; + }; + + SeVector() {} + SeVector(Type scalar) : x(scalar), y(scalar), z(scalar), w(scalar) {} + SeVector(Type s1, Type s2, Type s3, Type s4) : x(s1), y(s2), z(s3), w(s4) {} + explicit SeVector(Type s1, Type s2, const SeVector2 & vec2) : x(s1), y(s2), z(vec2.x), w(vec2.y) {} + explicit SeVector(Type s1, const SeVector2 & vec2, 
Type s4) : x(s1), y(vec2.x), z(vec2.y), w(s4) {} + explicit SeVector(const SeVector2 & vec2, Type s3, Type s4) : x(vec2.x), y(vec2.y), z(s3), w(s4) {} + explicit SeVector(const SeVector2 & vec2a, const SeVector2 & vec2b) : x(vec2a.x), y(vec2a.y), z(vec2b.x), w(vec2b.y) {} + explicit SeVector(const SeVector3 & vec3, Type s4) : x(vec3.x), y(vec3.y), z(vec3.z), w(s4) {} + explicit SeVector(Type s1, const SeVector3 & vec3) : x(s1), y(vec3.x), z(vec3.y), w(vec3.z) {} + + void operator+=(const SeVector3 & a) { x += a.x; y += a.y; z += a.z; } + void operator-=(const SeVector3 & a) { x -= a.x; y -= a.y; z -= a.z; } + void operator*=(const SeVector3 & a) { x *= a.x; y *= a.y; z *= a.z; } + void operator/=(const SeVector3 & a) { x /= a.x; y /= a.y; z /= a.z; } + + void operator+=(const SeVector & a) { x += a.x; y += a.y; z += a.z; w += a.w; } + void operator-=(const SeVector & a) { x -= a.x; y -= a.y; z -= a.z; w -= a.w; } + void operator*=(const SeVector & a) { x *= a.x; y *= a.y; z *= a.z; w *= a.w; } + void operator/=(const SeVector & a) { x /= a.x; y /= a.y; z /= a.z; w /= a.w; } + + void operator+=(Type scalar) { x += scalar; y += scalar; z += scalar; w += scalar; } + void operator-=(Type scalar) { x -= scalar; y -= scalar; z -= scalar; w -= scalar; } + void operator*=(Type scalar) { x *= scalar; y *= scalar; z *= scalar; w *= scalar; } + void operator/=(Type scalar) { x /= scalar; y /= scalar; z /= scalar; w /= scalar; } + + SeVector operator+(const SeVector & a) const { return SeVector(x + a.x, y + a.y, z + a.z, w + a.w); } + SeVector operator-(const SeVector & a) const { return SeVector(x - a.x, y - a.y, z - a.z, w - a.w); } + SeVector operator*(const SeVector & a) const { return SeVector(x * a.x, y * a.y, z * a.z, w * a.w); } + SeVector operator/(const SeVector & a) const { return SeVector(x / a.x, y / a.y, z / a.z, w / a.w); } + + SeVector operator+(const SeVector3 & a) const { return SeVector(x + a.x, y + a.y, z + a.z, w); } + SeVector operator-(const SeVector3 & 
a) const { return SeVector(x - a.x, y - a.y, z - a.z, w); } + SeVector operator*(const SeVector3 & a) const { return SeVector(x * a.x, y * a.y, z * a.z, w); } + SeVector operator/(const SeVector3 & a) const { return SeVector(x / a.x, y / a.y, z / a.z, w); } + + SeVector operator+(Type scalar) const { return SeVector(x + scalar, y + scalar, z + scalar, w + scalar); } + SeVector operator-(Type scalar) const { return SeVector(x - scalar, y - scalar, z - scalar, w - scalar); } + SeVector operator*(Type scalar) const { return SeVector(x * scalar, y * scalar, z * scalar, w * scalar); } + SeVector operator/(Type scalar) const { return SeVector(x / scalar, y / scalar, z / scalar, w / scalar); } + + friend SeVector operator+(Type scalar, const SeVector & a) { return SeVector(scalar + a.x, scalar + a.y, scalar + a.z, scalar + a.w); } + friend SeVector operator-(Type scalar, const SeVector & a) { return SeVector(scalar - a.x, scalar - a.y, scalar - a.z, scalar - a.w); } + friend SeVector operator*(Type scalar, const SeVector & a) { return SeVector(scalar * a.x, scalar * a.y, scalar * a.z, scalar * a.w); } + friend SeVector operator/(Type scalar, const SeVector & a) { return SeVector(scalar / a.x, scalar / a.y, scalar / a.z, scalar / a.w); } + + template explicit SeVector(const SeVector2 & vec2) : x(static_cast(vec2.x)), y(static_cast(vec2.y)), z(static_cast(0)), w(static_cast(0)) {} + template explicit SeVector(const SeVector3 & vec3) : x(static_cast(vec3.x)), y(static_cast(vec3.y)), z(static_cast(vec3.z)), w(static_cast(0)) {} + template explicit SeVector(const SeVector4 & vec4) : x(static_cast(vec4.x)), y(static_cast(vec4.y)), z(static_cast(vec4.z)), w(static_cast(vec4.w)) {} + + const Type & operator[](unsigned int i) const { SE_ASSERT(i < Elements); return values[i]; } + Type & operator[](unsigned int i) { SE_ASSERT(i < Elements); return values[i]; } +}; + +/************************************************************************* +**************************** Operators 
***************************** +*************************************************************************/ + + +template SE_INLINE bool operator==(const SeVector & a, const SeVector & b) +{ + for (int i = 0; i < SeVector::Elements; ++i) { if (a[i] != b[i]) return false; } return true; +} +template SE_INLINE bool operator!=(const SeVector & a, const SeVector & b) +{ + for (int i = 0; i < SeVector::Elements; ++i) { if (a[i] != b[i]) return true; } return false; +} +template SE_INLINE SeVector operator-(SeVector a) +{ + for (int i = 0; i < SeVector::Elements; ++i) a[i] = -a[i]; return a; +} +template SE_INLINE SeVector operator!(SeVector a) +{ + for (int i = 0; i < SeVector::Elements; ++i) a[i] = !a[i]; return a; +} +template SE_INLINE SeVector operator~(SeVector a) +{ + for (int i = 0; i < SeVector::Elements; ++i) a[i] = ~a[i]; return a; +} + + +/************************************************************************* +*************************** Type defines *************************** +*************************************************************************/ +// bool +typedef SeVector2 Bool2; +// char +typedef SeVector2 Char2; +typedef SeVector3 Char3; +typedef SeVector4 Char4; +typedef SeVector Char5; +typedef SeVector Char6; +// unsign char +typedef SeVector2 UChar2; +typedef SeVector3 UChar3; +typedef SeVector4 UChar4; +typedef SeVector UChar5; +typedef SeVector UChar6; +// short +typedef SeVector2 Short2; +typedef SeVector3 Short3; +typedef SeVector4 Short4; +typedef SeVector Short5; +typedef SeVector Short6; +// unsigned short +typedef SeVector2 UShort2; +typedef SeVector3 UShort3; +typedef SeVector4 UShort4; +typedef SeVector UShort5; +typedef SeVector UShort6; +// int +typedef SeVector2 Int2; +typedef SeVector3 Int3; +typedef SeVector4 Int4; +typedef SeVector Int5; +typedef SeVector Int6; +typedef SeVector Int8; +typedef SeVector Int12; +typedef SeVector Int30; +// unsigned int +typedef SeVector2 UInt2; +typedef SeVector3 UInt3; +typedef SeVector4 UInt4; 
+typedef SeVector UInt5; +typedef SeVector UInt6; +// float +typedef SeVector2 Float2; +typedef SeVector3 Float3; +typedef SeVector4 Float4; +typedef SeVector Float5; +typedef SeVector Float6; +typedef SeVector Float9; +typedef SeVector Float12; +// double +typedef SeVector2 Double2; +typedef SeVector3 Double3; +typedef SeVector4 Double4; +typedef SeVector Double5; +typedef SeVector Double6; +// size_t +typedef SeVector2 Size2; +typedef SeVector3 Size3; +typedef SeVector4 Size4; +typedef SeVector Size5; +typedef SeVector Size6; +/* +// SIMD-float3 +typedef SeVector Simd3f;*/ + +typedef SeVector3 Half3; //just for compile reason,don't use it + +SE_NAMESPACE_END \ No newline at end of file diff --git a/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeVectorSimd.h b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeVectorSimd.h new file mode 100644 index 0000000..66b06b9 --- /dev/null +++ b/thirdparty/preconditioner-for-cloth-and-deformable-body-simulation/SeVectorSimd.h @@ -0,0 +1,156 @@ +///////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2002 - 2022, +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The names of its contributors may not be used to endorse or promote +// products derived from this software without specific prior written +// permission. 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "SeVector.h" + +#include + +#ifdef WIN32 +#include +#endif + + +SE_NAMESPACE_BEGIN + + +#ifdef WIN32 + +template using SimdVector3 = SeVector; + +template<> struct SeVector +{ + static constexpr int Elements = 3; + + using value_type = float; + + union + { + struct { __m128 pack; }; + struct { float x, y, z, w; }; + struct { SeVector3 xyz; }; + struct { SeVector4 xyzw; }; + }; + + SeVector() {} + SeVector(const __m128 & p) : pack(p) {} + explicit SeVector(float scalar) : pack(_mm_setr_ps(scalar, scalar, scalar, 0.0f)) {} //pack(_mm_set_ps1(scalar)) {} + SeVector(float s1, float s2, float s3) : pack(_mm_setr_ps(s1, s2, s3, 0.0f)) {} + SeVector(float s1, float s2, float s3, float s4) : pack(_mm_setr_ps(s1, s2, s3, s4)) {} + explicit SeVector(const SeVector & vec) : pack(_mm_setr_ps(vec.x, vec.y, vec.z, 0.f)) {} + + void operator+=(const SeVector & a) { pack = _mm_add_ps(pack, a.pack); } + void operator-=(const SeVector & a) { pack = _mm_sub_ps(pack, a.pack); } + void operator*=(const SeVector & a) { pack = _mm_mul_ps(pack, a.pack); } + void operator/=(const SeVector & a) { pack = _mm_div_ps(pack, a.pack); } + + void operator+=(const __m128 & a) { pack = 
_mm_add_ps(pack, a); } + void operator-=(const __m128 & a) { pack = _mm_sub_ps(pack, a); } + void operator*=(const __m128 & a) { pack = _mm_mul_ps(pack, a); } + void operator/=(const __m128 & a) { pack = _mm_div_ps(pack, a); } + + void operator+=(float scalar) { pack = _mm_add_ps(pack, _mm_set_ps1(scalar)); } + void operator-=(float scalar) { pack = _mm_sub_ps(pack, _mm_set_ps1(scalar)); } + void operator*=(float scalar) { pack = _mm_mul_ps(pack, _mm_set_ps1(scalar)); } + void operator/=(float scalar) { pack = _mm_div_ps(pack, _mm_set_ps1(scalar)); } + + SeVector operator+(const SeVector & a) const { return SeVector(_mm_add_ps(pack, a.pack)); } + SeVector operator-(const SeVector & a) const { return SeVector(_mm_sub_ps(pack, a.pack)); } + SeVector operator*(const SeVector & a) const { return SeVector(_mm_mul_ps(pack, a.pack)); } + SeVector operator/(const SeVector & a) const { return SeVector(_mm_div_ps(pack, a.pack)); } + + SeVector operator+(const __m128 & a) const { return SeVector(_mm_add_ps(pack, a)); } + SeVector operator-(const __m128 & a) const { return SeVector(_mm_sub_ps(pack, a)); } + SeVector operator*(const __m128 & a) const { return SeVector(_mm_mul_ps(pack, a)); } + SeVector operator/(const __m128 & a) const { return SeVector(_mm_div_ps(pack, a)); } + + SeVector operator+(float scalar) const { return SeVector(_mm_add_ps(pack, _mm_set_ps1(scalar))); } + SeVector operator-(float scalar) const { return SeVector(_mm_sub_ps(pack, _mm_set_ps1(scalar))); } + SeVector operator*(float scalar) const { return SeVector(_mm_mul_ps(pack, _mm_set_ps1(scalar))); } + SeVector operator/(float scalar) const { return SeVector(_mm_div_ps(pack, _mm_set_ps1(scalar))); } + + friend SeVector operator+(float scalar, const SeVector & a) { return SeVector(_mm_add_ps(_mm_set_ps1(scalar), a.pack)); } + friend SeVector operator-(float scalar, const SeVector & a) { return SeVector(_mm_sub_ps(_mm_set_ps1(scalar), a.pack)); } + friend SeVector operator*(float scalar, const SeVector & 
a) { return SeVector(_mm_mul_ps(_mm_set_ps1(scalar), a.pack)); } + friend SeVector operator/(float scalar, const SeVector & a) { return SeVector(_mm_div_ps(_mm_set_ps1(scalar), a.pack)); } + + const float & operator[](unsigned int i) const { SE_ASSERT(i < 4); return pack.m128_f32[i]; } + float & operator[](unsigned int i) { SE_ASSERT(i < 4); return pack.m128_f32[i]; } +}; + + +/************************************************************************* +**************************** Operators ***************************** +*************************************************************************/ + +template SE_INLINE bool operator==(const SimdVector3 & a, const SimdVector3 & b) +{ + for (int i = 0; i < SimdVector3::Elements; ++i) { if (a[i] != b[i]) return false; } return true; +} +template SE_INLINE bool operator!=(const SimdVector3 & a, const SimdVector3 & b) +{ + for (int i = 0; i < SimdVector3::Elements; ++i) { if (a[i] != b[i]) return true; } return false; +} +template SE_INLINE SimdVector3 operator-(const SimdVector3 & a) +{ + for (int i = 0; i < SimdVector3::Elements; ++i) a[i] = -a[i]; return a; +} +template SE_INLINE SimdVector3 operator!(const SimdVector3 & a) +{ + for (int i = 0; i < SimdVector3::Elements; ++i) a[i] = !a[i]; return a; +} +template SE_INLINE SimdVector3 operator~(const SimdVector3 & a) +{ + for (int i = 0; i < SimdVector3::Elements; ++i) a[i] = ~a[i]; return a; +} + +template<> SE_INLINE SimdVector3 operator - (const SimdVector3 & a) +{ + return _mm_xor_ps(a.pack, _mm_set1_ps(-0.f)); +} + +/* +template<> SE_INLINE SeVector operator - (SeVector a) +{ + return _mm_xor_ps(a.pack, _mm_set1_ps(-0.f)); +}*/ + +#endif + + +/************************************************************************* +*************************** Type defines *************************** +*************************************************************************/ + + +using Simd3f = SimdVector3; +template using SimdVector3 = SeVector; + +using SeVec3fSimd = Simd3f; + + 
+SE_NAMESPACE_END